/*
 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
;
; Copy bytes of data around. Handles overlapped data.
;
;
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
#include <assym.s>

; These routines use CR5 for certain flags:
; Use CR5_lt to indicate non-cached (in bcopy and memcpy)
#define noncache 20


; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
#define BCOPY_SF_SIZE   32          // total size
#define BCOPY_SF_MSR    16          // we save caller's MSR here (possibly minus VEC and FP)


#define kShort          32          // short operands are special cased


; void bcopy_physvir_32(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: - neither source nor destination can cross a page.
;            - interrupts must be disabled when this routine is called.
;            - translation must be on when called.
;
; To do the copy, we build a 128KB DBAT for both the source and sink. If both map to the
; same block, only one is loaded. We do not touch the IBATs, so there is no issue if either
; physical page address is the same as the virtual address of the instructions we are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. Note that on 32-bit architectures this will ignore the high half of the
; passed-in value. This should be OK, since we cannot have addresses bigger than 32 bits
; there anyhow.
;
; Note also that this routine is used only on 32-bit machines. If you're contemplating use
; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
; for an example of how this is done.

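; Illustrative sketch (not part of the original source): the flow below is roughly the
; following hypothetical C, where crosses_page(), map_with_dbat(), and invalidate_dbats()
; are stand-ins for the inline page-overflow test and DBAT manipulation:
;
;     if (crosses_page(from, nbytes) || crosses_page(to, nbytes))
;         goto bcopy_phys1;           // fall back to the normal physical copy
;     map_with_dbat(to);              // temporary DBATs cover both operands;
;     map_with_dbat(from);            // only one is loaded if both share a block
;     bcopy(from, to, nbytes);        // copy with data translation on
;     invalidate_dbats();             // tear the temporary DBATs back down
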
            .align  5
            .globl  EXT(bcopy_physvir_32)

LEXT(bcopy_physvir_32)
            mflr    r0                          ; get return address
            rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            mfsprg  r8,2                        ; get processor feature flags
            stw     r0,8(r1)                    ; save return address
            rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
            stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
            mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
            subi    r0,r7,1                     ; get length - 1
            rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            add     r11,r3,r0                   ; Point to last byte of sink
            mr      r5,r7                       ; Get the length into the right register
            rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits

; This test for page overflow may not work if the length is negative. Negative lengths are invalid input
; to bcopy_physvir() on 32-bit machines, and will result in a panic.

            add     r12,r4,r0                   ; Point to last byte of source
            xor     r7,r11,r3                   ; See if we went to next page
            xor     r8,r12,r4                   ; See if we went to next page
            or      r0,r7,r8                    ; Combine wrap

//          li      r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)    ; Set default attributes
            li      r9,((2<<3)|2)               ; Set default attributes
            rlwinm. r0,r0,0,0,19                ; Did we overflow a page?
            li      r7,2                        ; Set validity flags
            li      r8,2                        ; Set validity flags
            bne-    bcopy_phys1                 ; Overflowed page, do normal physical copy...

            rlwimi  r11,r9,0,15,31              ; Set sink lower DBAT value
            rlwimi  r12,r9,0,15,31              ; Set source lower DBAT value
            rlwimi  r7,r11,0,0,14               ; Set sink upper DBAT value
            rlwimi  r8,r12,0,0,14               ; Set source upper DBAT value
            cmplw   cr1,r11,r12                 ; See if sink and source are same block

            sync

            mtdbatl 0,r11                       ; Set sink lower DBAT
            mtdbatu 0,r7                        ; Set sink upper DBAT

            beq-    cr1,bcpvsame                ; Source and sink are in same block

            mtdbatl 1,r12                       ; Set source lower DBAT
            mtdbatu 1,r8                        ; Set source upper DBAT

bcpvsame:
            sync                                ; wait for the BATs to stabilize
            isync

            bl      EXT(bcopy)                  ; BATs set up, args in r3-r5, so do the copy with DR on

            li      r0,0                        ; Get set to invalidate upper half of BATs
            sync                                ; Make sure all is well
            mtdbatu 0,r0                        ; Clear sink upper DBAT
            mtdbatu 1,r0                        ; Clear source upper DBAT
            sync
            isync

            lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address
            addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
            mtlr    r0
            blr

; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy. This one will not work in user state.
; This routine is used on both 32- and 64-bit machines.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. Note that on 32-bit architectures this will ignore the high half of the
; passed-in value. This should be OK, since we cannot have addresses bigger than 32 bits
; there anyhow.
;
; Also note that you probably will not be happy if either the sink or source spans the
; boundary between RAM and I/O space. There is a good chance of hanging the machine,
; and this code does not check for it, so be careful.
;
; NOTE: when called, translation must be on, and we must be in 32-bit mode.
;       Interrupts may or may not be disabled.

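; Illustrative sketch (not part of the original source): conceptually the 32-bit path
; below does the following, with msr_read/msr_write standing in for mfmsr/mtmsr:
;
;     msr = msr_read();
;     msr_write(msr & ~(MSR_DR | MSR_VEC | MSR_FP));   // translation (and VEC/FP) off
;     bcopy(from, to, nbytes);                         // real-mode copy
;     msr_write(msr_read() | MSR_DR);                  // translation back on
;
; On 64-bit processors SF is also set for the copy, and operands in I/O space are
; diverted to the cache-inhibited path further down.
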
            .align  5
            .globl  EXT(bcopy_phys)

LEXT(bcopy_phys)
            mflr    r0                          ; get return address
            rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            stw     r0,8(r1)                    ; save return address
            mfsprg  r8,2                        ; get processor feature flags
            stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
            rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
            rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
            rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
            mr      r5,r7                       ; Get the length into the right register

bcopy_phys1:                                    ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
            mfmsr   r9                          ; Get the MSR
            lis     r6,hi16(MASK(MSR_VEC))      ; Get vector enable
            ori     r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
            andc    r9,r9,r6                    ; unconditionally turn DR, VEC, and FP off
            bt++    pf64Bitb,bcopy_phys64       ; skip if 64-bit (only they take hint)

; 32-bit CPUs

            mtmsr   r9                          ; turn DR, FP, and VEC off
            isync                               ; Wait for it

            bl      EXT(bcopy)                  ; do the copy with translation off and caching on

            mfmsr   r9                          ; Get the MSR
            ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on (but leave VEC and FP off)
            mtmsr   r9                          ; restore msr
            isync                               ; wait for it to happen
            lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
            mtlr    r0
            addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
            blr


; 64-bit: turn DR off and SF on.

bcopy_phys64:                                   ; r9 = MSR with DR, VEC, and FP off
            ori     r8,r9,lo16(MASK(MSR_DR))    ; make a copy with DR back on... this is what we return to caller
            srdi    r2,r3,31                    ; Get a 1 if source is in I/O memory
            li      r0,1                        ; Note - we use this in a couple places below
            srdi    r10,r4,31                   ; Get a 1 if sink is in I/O memory
            std     r8,BCOPY_SF_MSR(r1)         ; save caller's MSR so we remember whether EE was on
            rldimi  r9,r0,63,MSR_SF_BIT         ; set SF on in MSR we will copy with
            cmpldi  cr0,r2,1                    ; Is source in I/O memory?
            cmpldi  cr7,r10,1                   ; Is sink in I/O memory?
            mtmsrd  r9                          ; turn 64-bit addressing on, data translation off
            isync                               ; wait for it to happen
            cror    cr7_eq,cr0_eq,cr7_eq        ; See if either source or sink is in I/O area
            beq--   cr7,io_space_real_mode_copy ; an operand is in I/O space

            bl      EXT(bcopy)                  ; do copy with DR off and SF on, cache enabled

bcopy_phys64x:
            mfmsr   r9                          ; Get the MSR we used to copy
            rldicl  r9,r9,0,MSR_SF_BIT+1        ; clear SF
            ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on
            mtmsrd  r9                          ; turn 64-bit mode off, translation back on
            isync                               ; wait for it to happen
            lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
            ld      r8,BCOPY_SF_MSR(r1)         ; get caller's MSR once translation is back on
            mtlr    r0
            mtmsrd  r8,1                        ; turn EE back on if necessary
            addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
            blr

; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
; this state, so we have to disable interrupts as well. This is an unpleasant hack.

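; Illustrative sketch (not part of the original source) of the sequence below; hid4_read,
; hid4_write, and flush_erat are stand-ins for the inline mfspr/mtspr/slbie code:
;
;     msr &= ~MSR_EE;                        // no interrupts while HID4 is changed
;     hid4_write(hid4_read() | CI_BIT);      // CI_BIT is built as 1 << (32+8); forces
;     flush_erat();                          //   real-mode accesses cache-inhibited
;     bcopy_nc(from, to, nbytes);            // uncached copy
;     hid4_write(hid4_read() & ~CI_BIT);     // restore normal caching
;     flush_erat();
;
; The slbie of an unlikely ESID before and after the copy simply flushes the ERAT so
; that the HID4 change is honored.
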
io_space_real_mode_copy:                        ; r0=1, r9=MSR we want to copy with
            sldi    r11,r0,31-MSR_EE_BIT        ; Get a mask for the EE bit
            sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
            andc    r9,r9,r11                   ; Turn off EE bit
            mfspr   r2,hid4                     ; Get HID4
            mtmsrd  r9,1                        ; Force off EE
            or      r2,r2,r0                    ; Set bit to make real accesses cache-inhibited
            sync                                ; Sync up
            mtspr   hid4,r2                     ; Make real accesses cache-inhibited
            isync                               ; Toss prefetches

            lis     r12,0xE000                  ; Get the unlikeliest ESID possible
            srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
            slbie   r12                         ; Make sure the ERAT is cleared

            sync
            isync

            bl      EXT(bcopy_nc)               ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited

            li      r0,1                        ; Get a 1
            sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
            mfspr   r2,hid4                     ; Get HID4
            andc    r2,r2,r0                    ; Clear the bit that makes real accesses cache-inhibited
            sync                                ; Sync up
            mtspr   hid4,r2                     ; Make real accesses cacheable again
            isync                               ; Toss prefetches

            lis     r12,0xE000                  ; Get the unlikeliest ESID possible
            srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
            slbie   r12                         ; Make sure the ERAT is cleared
            b       bcopy_phys64x

;
; shortcopy
;
; Special case short operands (<32 bytes), which are very common. Note that the check for
; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
; is similar. We do get the direction right when it counts (i.e., when the operands overlap).
; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
; might do unaligned accesses this code cannot be called from bcopy_nc().
;   r4 = destination
;   r5 = length (<32)
;   r6 = source
;   r12 = (dest - source)

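; Illustrative sketch (not part of the original source): the direction test below is,
; in C-like terms,
;
;     if ((uint32_t)(dest - source) < length)         // overlap with dest above source
;         copy_backwards();                           // the "bbackend" path
;     else
;         copy_forwards();                            // the "backend" path (common case)
;
; where copy_backwards/copy_forwards are just names for the two join points that follow.
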
            .align  5
shortcopy:
            cmplw   r12,r5                      ; must move reverse if (dest-source)<length
            mtcrf   2,r5                        ; move length to cr6 and cr7 one at a time...
            mtcrf   1,r5                        ; ...which is faster on G4 and G5
            bge++   backend                     ; handle forward moves (most common case)
            add     r6,r6,r5                    ; point one past end of operands in reverse moves
            add     r4,r4,r5
            b       bbackend                    ; handle reverse moves

;
; void bcopy(from, to, nbytes)
;
; NOTE: bcopy is called from copyin and copyout, etc., with the "thread_recover" ptr set.
; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
; registers to accelerate the copy.
;
; NOTE: this code can be called in any of three "modes":
;       - on 32-bit processors (32-byte cache line)
;       - on 64-bit processors running in 32-bit mode (128-byte cache line)
;       - on 64-bit processors running in 64-bit mode (128-byte cache line)

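; Illustrative sketch (not part of the original source) of the dispatch below:
;
;     if (nbytes < kShort)            goto shortcopy;      // < 32 bytes
;     else if (to == from)            return;              // nothing to move
;     else if (64-bit processor)      goto copyit64;       // 128-byte cache lines
;     else                            goto copyit32;       // 32-byte cache lines
;
; The 64-bit branch is the instruction that gets patched to a NOP on 32-bit
; processors (see bcopy_nop_if_32bit).
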
            .align  5
            .globl  EXT(bcopy)
            .globl  EXT(bcopy_nop_if_32bit)

LEXT(bcopy)
            cmplwi  cr1,r5,kShort               ; less than 32 bytes?
            sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
            mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
            blt     cr1,shortcopy               ; special case short operands
            crclr   noncache                    ; Set cached
LEXT(bcopy_nop_if_32bit)
            bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
            bne+    copyit32                    ; handle 32-bit processor
            blr                                 ; to==from so nothing to do

;
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc() operates on non-cached memory so we can not use any kind of cache instructions.
; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.

            .align  5
            .globl  EXT(bcopy_nc)
            .globl  EXT(bcopy_nc_nop_if_32bit)

LEXT(bcopy_nc)
            cmpwi   cr1,r5,0                    ; Check if we have a 0 length
            sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
            mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
            crset   noncache                    ; Set non-cached
            cror    cr0_eq,cr1_eq,cr0_eq        ; set cr0 beq if either length zero or to==from
LEXT(bcopy_nc_nop_if_32bit)
            bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
            bne+    copyit32                    ; handle 32-bit processor
            blr                                 ; either zero length or to==from

;
; void* memcpy(to, from, nbytes)
; void* memmove(to, from, nbytes)
;
; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
; However, they would work correctly if called in 64-bit mode.

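; Illustrative note (not part of the original source): memcpy/memmove take (to, from)
; in the opposite order from bcopy and must return the destination pointer, so they
; just swap the arguments into bcopy's registers and share the same backend below.
; In C-like terms:
;
;     void *memcpy(void *to, const void *from, size_t nbytes) {
;         bcopy(from, to, nbytes);     // same copy backend as bcopy
;         return to;                   // r3 is preserved to serve as this return value
;     }
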
            .align  5
            .globl  EXT(memcpy)
            .globl  EXT(memmove)
            .globl  EXT(memcpy_nop_if_32bit)

LEXT(memcpy)
LEXT(memmove)
            cmplwi  cr1,r5,kShort               ; less than 32 bytes?
            sub.    r12,r3,r4                   ; test for to==from in mode-independent way, start fwd/rev check
            mr      r6,r4                       ; Set source
            mr      r4,r3                       ; Set the "to" (must preserve r3 for return value)
            blt     cr1,shortcopy               ; special case short operands
            crclr   noncache                    ; Set cached
LEXT(memcpy_nop_if_32bit)
            bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
            beqlr-                              ; exit if to==from

; Here to copy on 32-bit processors.
;
; When we move the memory, overlapping operands must be handled. We also cannot use
; the cache instructions if we came from bcopy_nc. We need to preserve r3 because it
; must be returned for memcpy. We can be interrupted and lose control here.
;
; When entered:
;   r4 = destination
;   r5 = length (>0)
;   r6 = source
;   r12 = (dest - source)
;   cr5 = noncache flag

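; Illustrative note (not part of the original source): r8 below is built as
; sraw(0x80000000, cntlzw(length)), i.e. a run of high-order ones just wide enough
; that "andc x,x,r8" clamps any front-end alignment move to fewer bytes than the
; whole operand. In C-like terms (treating >> as an arithmetic shift, as sraw is):
;
;     mask  = (int32_t)0x80000000 >> count_leading_zeros(length);
;     front = align_bytes & ~mask;     // never move more than the operand holds
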
copyit32:                                       ; WARNING! can drop down to this label
            cmplw   cr1,r12,r5                  ; must move reverse if (dest-source)<length
            cntlzw  r11,r5                      ; get magnitude of length
            dcbt    0,r6                        ; start to touch in source
            lis     r10,hi16(0x80000000)        ; get 0x80000000
            neg     r9,r4                       ; start to get alignment for destination
            dcbtst  0,r4                        ; start to touch in destination
            sraw    r8,r10,r11                  ; get mask based on operand length, to limit alignment
            blt-    cr1,reverse32bit            ; reverse move required

; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
;   r4 = destination
;   r5 = length (>0)
;   r6 = source
;   r8 = inverse of largest mask smaller than operand length
;   r9 = neg(dest), used to compute alignment
;   cr5 = noncache flag

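; Illustrative note (not part of the original source): the front end below peels off
; 1, 2, 4, 8, and 16 bytes according to the low bits of the alignment count moved into
; cr6/cr7 (bit 31 = byte, 30 = halfword, 29 = word, 28 = doubleword, 27 = quadword),
; leaving the destination 32-byte aligned. In C-like terms, with moveN() standing for
; the copy steps below:
;
;     if (n & 1)  move1();
;     if (n & 2)  move2();
;     if (n & 4)  move4();
;     if (n & 8)  move8();
;     if (n & 16) move16();
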
forward32bit:                                   ; enter from 64-bit CPUs with word aligned uncached operands
            rlwinm  r7,r9,0,0x1F                ; get bytes to 32-byte-align destination
            andc.   r0,r7,r8                    ; limit to the maximum front end move
            mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
            beq     alline                      ; Already on a line...

            mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
            sub     r5,r5,r0                    ; Set the length left to move

            bf      31,alhalf                   ; No single byte to do...
            lbz     r7,0(r6)                    ; Get the byte
            addi    r6,r6,1                     ; Point to the next
            stb     r7,0(r4)                    ; Save the single
            addi    r4,r4,1                     ; Bump sink

; Sink is halfword aligned here

alhalf:     bf      30,alword                   ; No halfword to do...
            lhz     r7,0(r6)                    ; Get the halfword
            addi    r6,r6,2                     ; Point to the next
            sth     r7,0(r4)                    ; Save the halfword
            addi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

alword:     bf      29,aldouble                 ; No word to do...
            lwz     r7,0(r6)                    ; Get the word
            addi    r6,r6,4                     ; Point to the next
            stw     r7,0(r4)                    ; Save the word
            addi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

aldouble:   bf      28,alquad                   ; No double to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            addi    r6,r6,8                     ; Point to the next
            stw     r7,0(r4)                    ; Save the first word
            stw     r8,4(r4)                    ; Save the second word
            addi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

alquad:     bf      27,alline                   ; No quad to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            lwz     r9,8(r6)                    ; Get the third word
            stw     r7,0(r4)                    ; Save the first word
            lwz     r11,12(r6)                  ; Get the fourth word
            addi    r6,r6,16                    ; Point to the next
            stw     r8,4(r4)                    ; Save the second word
            stw     r9,8(r4)                    ; Save the third word
            stw     r11,12(r4)                  ; Save the fourth word
            addi    r4,r4,16                    ; Bump sink

; Sink is line aligned here

alline:     rlwinm. r0,r5,27,5,31               ; Get the number of full lines to move
            mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
            mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
            beq-    backend                     ; No full lines to move

            mtctr   r0                          ; set up loop count
            li      r0,96                       ; Stride for touch ahead
            b       nxtline

            .align  4
nxtline:
            lwz     r2,0(r6)                    ; Get the first word
            lwz     r5,4(r6)                    ; Get the second word
            lwz     r7,8(r6)                    ; Get the third word
            lwz     r8,12(r6)                   ; Get the fourth word
            lwz     r9,16(r6)                   ; Get the fifth word
            lwz     r10,20(r6)                  ; Get the sixth word
            lwz     r11,24(r6)                  ; Get the seventh word
            lwz     r12,28(r6)                  ; Get the eighth word
            bt-     noncache,skipz              ; Skip if we are not cached...
            dcbz    0,r4                        ; Blow away the whole line because we are replacing it
            dcbt    r6,r0                       ; Touch ahead a bit
skipz:
            addi    r6,r6,32                    ; Point to the next
            stw     r2,0(r4)                    ; Save the first word
            stw     r5,4(r4)                    ; Save the second word
            stw     r7,8(r4)                    ; Save the third word
            stw     r8,12(r4)                   ; Save the fourth word
            stw     r9,16(r4)                   ; Save the fifth word
            stw     r10,20(r4)                  ; Save the sixth word
            stw     r11,24(r4)                  ; Save the seventh word
            stw     r12,28(r4)                  ; Save the eighth word
            addi    r4,r4,32                    ; Bump sink
            bdnz+   nxtline                     ; Do the next line, if any...

; Move backend quadword

backend:                                        ; Join here from "shortcopy" for forward moves <32 bytes
            bf      27,noquad                   ; No quad to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            lwz     r9,8(r6)                    ; Get the third word
            lwz     r11,12(r6)                  ; Get the fourth word
            stw     r7,0(r4)                    ; Save the first word
            addi    r6,r6,16                    ; Point to the next
            stw     r8,4(r4)                    ; Save the second word
            stw     r9,8(r4)                    ; Save the third word
            stw     r11,12(r4)                  ; Save the fourth word
            addi    r4,r4,16                    ; Bump sink

; Move backend double

noquad:     bf      28,nodouble                 ; No double to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            addi    r6,r6,8                     ; Point to the next
            stw     r7,0(r4)                    ; Save the first word
            stw     r8,4(r4)                    ; Save the second word
            addi    r4,r4,8                     ; Bump sink

; Move backend word

nodouble:   bf      29,noword                   ; No word to do...
            lwz     r7,0(r6)                    ; Get the word
            addi    r6,r6,4                     ; Point to the next
            stw     r7,0(r4)                    ; Save the word
            addi    r4,r4,4                     ; Bump sink

; Move backend halfword

noword:     bf      30,nohalf                   ; No halfword to do...
            lhz     r7,0(r6)                    ; Get the halfword
            addi    r6,r6,2                     ; Point to the next
            sth     r7,0(r4)                    ; Save the halfword
            addi    r4,r4,2                     ; Bump sink

; Move backend byte

nohalf:     bflr    31                          ; Leave cuz we are all done...
            lbz     r7,0(r6)                    ; Get the byte
            stb     r7,0(r4)                    ; Save the single
            blr

; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.
;   r4 = destination
;   r5 = length (>0)
;   r6 = source
;   r8 = inverse of largest mask smaller than operand length
;   cr5 = noncache flag (but we don't dcbz anyway)

reverse32bit:                                   ; here from 64-bit code with word aligned uncached operands
            add     r4,r5,r4                    ; Point past the last sink byte
            add     r6,r5,r6                    ; Point past the last source byte
            rlwinm  r7,r4,0,0x1F                ; Calculate the length to align dest on cache boundary
            li      r12,-1                      ; Make sure we touch in the actual line
            andc.   r0,r7,r8                    ; Apply movement limit
            dcbt    r12,r6                      ; Touch in the last line of source
            mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
            dcbtst  r12,r4                      ; Touch in the last line of the sink
            mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
            beq-    balline                     ; Already on cache line boundary (or too short to bother)

            sub     r5,r5,r0                    ; Precalculate move length left after alignment

            bf      31,balhalf                  ; No single byte to do...
            lbz     r7,-1(r6)                   ; Get the byte
            subi    r6,r6,1                     ; Point to the next
            stb     r7,-1(r4)                   ; Save the single
            subi    r4,r4,1                     ; Bump sink

; Sink is halfword aligned here

balhalf:    bf      30,balword                  ; No halfword to do...
            lhz     r7,-2(r6)                   ; Get the halfword
            subi    r6,r6,2                     ; Point to the next
            sth     r7,-2(r4)                   ; Save the halfword
            subi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

balword:    bf      29,baldouble                ; No word to do...
            lwz     r7,-4(r6)                   ; Get the word
            subi    r6,r6,4                     ; Point to the next
            stw     r7,-4(r4)                   ; Save the word
            subi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

baldouble:  bf      28,balquad                  ; No double to do...
            lwz     r7,-8(r6)                   ; Get the first word
            lwz     r8,-4(r6)                   ; Get the second word
            subi    r6,r6,8                     ; Point to the next
            stw     r7,-8(r4)                   ; Save the first word
            stw     r8,-4(r4)                   ; Save the second word
            subi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

balquad:    bf      27,balline                  ; No quad to do...
            lwz     r7,-16(r6)                  ; Get the first word
            lwz     r8,-12(r6)                  ; Get the second word
            lwz     r9,-8(r6)                   ; Get the third word
            lwz     r11,-4(r6)                  ; Get the fourth word
            stw     r7,-16(r4)                  ; Save the first word
            subi    r6,r6,16                    ; Point to the next
            stw     r8,-12(r4)                  ; Save the second word
            stw     r9,-8(r4)                   ; Save the third word
            stw     r11,-4(r4)                  ; Save the fourth word
            subi    r4,r4,16                    ; Bump sink

; Sink is line aligned here

balline:    rlwinm. r0,r5,27,5,31               ; Get the number of full lines to move
            mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
            mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
            beq-    bbackend                    ; No full lines to move
            mtctr   r0                          ; set up loop count
            b       bnxtline

            .align  4
bnxtline:
            lwz     r7,-32(r6)                  ; Get the first word
            lwz     r5,-28(r6)                  ; Get the second word
            lwz     r2,-24(r6)                  ; Get the third word
            lwz     r12,-20(r6)                 ; Get the fourth word
            lwz     r11,-16(r6)                 ; Get the fifth word
            lwz     r10,-12(r6)                 ; Get the sixth word
            lwz     r9,-8(r6)                   ; Get the seventh word
            lwz     r8,-4(r6)                   ; Get the eighth word
            subi    r6,r6,32                    ; Point to the next

            stw     r7,-32(r4)                  ; Save the first word
            stw     r5,-28(r4)                  ; Save the second word
            stw     r2,-24(r4)                  ; Save the third word
            stw     r12,-20(r4)                 ; Save the fourth word
            stw     r11,-16(r4)                 ; Save the fifth word
            stw     r10,-12(r4)                 ; Save the sixth word
            stw     r9,-8(r4)                   ; Save the seventh word
            stw     r8,-4(r4)                   ; Save the eighth word
            subi    r4,r4,32                    ; Bump sink

            bdnz+   bnxtline                    ; Do the next line, if any...

;
; Note: We touched these lines in at the beginning
;

; Move backend quadword

bbackend:                                       ; Join here from "shortcopy" for reverse moves of <32 bytes
            bf      27,bnoquad                  ; No quad to do...
            lwz     r7,-16(r6)                  ; Get the first word
            lwz     r8,-12(r6)                  ; Get the second word
            lwz     r9,-8(r6)                   ; Get the third word
            lwz     r11,-4(r6)                  ; Get the fourth word
            stw     r7,-16(r4)                  ; Save the first word
            subi    r6,r6,16                    ; Point to the next
            stw     r8,-12(r4)                  ; Save the second word
            stw     r9,-8(r4)                   ; Save the third word
            stw     r11,-4(r4)                  ; Save the fourth word
            subi    r4,r4,16                    ; Bump sink

; Move backend double

bnoquad:    bf      28,bnodouble                ; No double to do...
            lwz     r7,-8(r6)                   ; Get the first word
            lwz     r8,-4(r6)                   ; Get the second word
            subi    r6,r6,8                     ; Point to the next
            stw     r7,-8(r4)                   ; Save the first word
            stw     r8,-4(r4)                   ; Save the second word
            subi    r4,r4,8                     ; Bump sink

; Move backend word

bnodouble:  bf      29,bnoword                  ; No word to do...
            lwz     r7,-4(r6)                   ; Get the word
            subi    r6,r6,4                     ; Point to the next
            stw     r7,-4(r4)                   ; Save the word
            subi    r4,r4,4                     ; Bump sink

; Move backend halfword

bnoword:    bf      30,bnohalf                  ; No halfword to do...
            lhz     r7,-2(r6)                   ; Get the halfword
            subi    r6,r6,2                     ; Point to the next
            sth     r7,-2(r4)                   ; Save the halfword
            subi    r4,r4,2                     ; Bump sink

; Move backend byte

bnohalf:    bflr    31                          ; Leave cuz we are all done...
            lbz     r7,-1(r6)                   ; Get the byte
            stb     r7,-1(r4)                   ; Save the single
            blr

// Here on 64-bit processors, which have a 128-byte cache line. This can be
// called either in 32 or 64-bit mode, which makes the test for reverse moves
// a little tricky. We've already filtered out the (source==dest) and (length==0)
// special cases.
//
// When entered:
//   r4 = destination (32 or 64-bit ptr)
//   r5 = length (always 32 bits)
//   r6 = source (32 or 64-bit ptr)
//   r12 = (dest - source), reverse move required if (dest-source)<length
//   cr5 = noncache flag

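// Illustrative note (not part of the original source): the subc/addze pair below
// makes the reverse-move test without caring which addressing mode we are in:
// subc sets CA iff (dest - source) >= length (no borrow), addze then copies CA
// into r0, so cr0 is "eq" (r0 == 0) exactly when a reverse move is required.
// In C-like terms:
//
//     reverse_needed = (uintptr_t)(dest - source) < (uintptr_t)length;
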
            .align  5
copyit64:
            rlwinm  r7,r5,0,0,31                // truncate length to 32-bit, in case we're running in 64-bit mode
            cntlzw  r11,r5                      // get magnitude of length
            dcbt    0,r6                        // touch in 1st block of source
            dcbtst  0,r4                        // touch in 1st destination cache block
            subc    r7,r12,r7                   // set Carry if (dest-source)>=length, in mode-independent way
            li      r0,0                        // get a 0
            lis     r10,hi16(0x80000000)        // get 0x80000000
            addze.  r0,r0                       // set cr0 on carry bit (beq if reverse move required)
            neg     r9,r4                       // start to get alignment for destination
            sraw    r8,r10,r11                  // get mask based on operand length, to limit alignment
            bt--    noncache,c64uncached        // skip if uncached
            beq--   c64rdouble                  // handle cached reverse moves

// Forward, cached or doubleword aligned uncached. This is the common case.
// NOTE: we never do an unaligned access if the source and destination are "relatively"
// doubleword aligned. We depend on this in the uncached case.
//   r4 = destination
//   r5 = length (>0)
//   r6 = source
//   r8 = inverse of largest mask smaller than operand length
//   r9 = neg(dest), used to compute alignment
//   cr5 = noncache flag

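// Illustrative note (not part of the original source): the cached forward path
// proceeds in stages -- bytes until the destination is doubleword aligned, then
// doublewords until it is 128-byte aligned, then whole 128-byte chunks (with
// dcbz128 on the destination when that is safe), then leftover doublewords and
// bytes. Roughly, ignoring the clamping of each stage to the operand length:
//
//     while (dst & 7)      copy_byte();
//     while (dst & 127)    copy_doubleword();
//     while (len >= 128)   copy_chunk128();     // dcbz128 unless the lines overlap
//     while (len >= 8)     copy_doubleword();
//     while (len)          copy_byte();
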
c64double:
            rlwinm  r7,r9,0,0x7F                // get #bytes to 128-byte align destination
            andc    r7,r7,r8                    // limit by operand length
            andi.   r8,r7,7                     // r8 <- #bytes to doubleword align
            srwi    r9,r7,3                     // r9 <- #doublewords to 128-byte align
            sub     r5,r5,r7                    // adjust length remaining
            cmpwi   cr1,r9,0                    // any doublewords to move to cache align?
            srwi    r10,r5,7                    // r10 <- 128-byte chunks to xfer after aligning dest
            cmpwi   cr7,r10,0                   // set cr7 on chunk count
            beq     c64double2                  // dest already doubleword aligned
            mtctr   r8
            b       c64double1

            .align  5                           // align inner loops
c64double1:                                     // copy bytes until dest is doubleword aligned
            lbz     r0,0(r6)
            addi    r6,r6,1
            stb     r0,0(r4)
            addi    r4,r4,1
            bdnz    c64double1

c64double2:                                     // r9/cr1=doublewords, r10/cr7=128-byte chunks
            beq     cr1,c64double4              // no doublewords to xfer in order to cache align
            mtctr   r9
            b       c64double3

            .align  5                           // align inner loops
c64double3:                                     // copy doublewords until dest is 128-byte aligned
            ld      r7,0(r6)
            addi    r6,r6,8
            std     r7,0(r4)
            addi    r4,r4,8
            bdnz    c64double3

// Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.

c64double4:                                     // r10/cr7=128-byte chunks
            rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords, after moving chunks
            cmpwi   cr1,r0,0                    // set cr1 on leftover doublewords
            beq     cr7,c64double7              // no 128-byte chunks

; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.

            sub     r8,r6,r4                    // r8 <- (source - dest)
            rldicr. r0,r8,0,63-7                // zero low 7 bits and check for 0, mode independent
            cror    noncache,cr0_eq,noncache    // turn on "noncache" flag if (source-dest)<128
            mtctr   r10
            b       c64InnerLoop

            .align  5                           // align inner loop
c64InnerLoop:                                   // loop copying 128-byte cache lines to 128-aligned destination
            ld      r0,0(r6)                    // start pipe: load 1st half-line
            ld      r2,8(r6)
            ld      r7,16(r6)
            ld      r8,24(r6)
            ld      r9,32(r6)
            ld      r10,40(r6)
            ld      r11,48(r6)
            ld      r12,56(r6)
            bt      noncache,c64InnerLoop1      // skip if uncached or overlap
            dcbz128 0,r4                        // avoid prefetch of next cache line
c64InnerLoop1:

            std     r0,0(r4)
            std     r2,8(r4)
            std     r7,16(r4)
            std     r8,24(r4)
            std     r9,32(r4)
            std     r10,40(r4)
            std     r11,48(r4)
            std     r12,56(r4)

            ld      r0,64(r6)                   // load 2nd half of chunk
            ld      r2,72(r6)
            ld      r7,80(r6)
            ld      r8,88(r6)
            ld      r9,96(r6)
            ld      r10,104(r6)
            ld      r11,112(r6)
            ld      r12,120(r6)
            addi    r6,r6,128

            std     r0,64(r4)
            std     r2,72(r4)
            std     r7,80(r4)
            std     r8,88(r4)
            std     r9,96(r4)
            std     r10,104(r4)
            std     r11,112(r4)
            std     r12,120(r4)
            addi    r4,r4,128                   // advance to next dest chunk

            bdnz    c64InnerLoop                // loop if more chunks

c64double7:                                     // r5 <- leftover bytes, cr1 set on doubleword count
            rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords (0-15)
            andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes (0-7)
            beq     cr1,c64byte                 // no leftover doublewords
            mtctr   r0
            b       c64double8

            .align  5                           // align inner loop
c64double8:                                     // loop copying leftover doublewords
            ld      r0,0(r6)
            addi    r6,r6,8
            std     r0,0(r4)
            addi    r4,r4,8
            bdnz    c64double8


// Forward byte loop.

c64byte:                                        // r5/cr0 <- byte count (can be big if unaligned uncached)
            beqlr                               // done if no leftover bytes
            mtctr   r5
            b       c64byte1

            .align  5                           // align inner loop
c64byte1:
            lbz     r0,0(r6)
            addi    r6,r6,1
            stb     r0,0(r4)
            addi    r4,r4,1
            bdnz    c64byte1

            blr

// Uncached copies. We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
//   r4 = destination
//   r5 = length (>0)
//   r6 = source
//   r8 = inverse of largest mask smaller than operand length
//   r9 = neg(dest), used to compute alignment
//   r12 = (dest-source), used to test relative alignment
//   cr0 = beq if reverse move required
//   cr5 = noncache flag

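// Illustrative note (not part of the original source): uncached operands are
// dispatched on the relative alignment of (dest - source), since that decides
// the widest access that can be used without ever generating an unaligned
// (and therefore faulting) load or store. In C-like terms:
//
//     if (((dest - source) & 7) == 0)       use the doubleword loops;   // ld/std
//     else if (((dest - source) & 3) == 0)  use the 32-bit word loops;  // lwz/stw
//     else                                  copy one byte at a time;
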
c64uncached:
            rlwinm  r10,r12,0,29,31             // relatively doubleword aligned?
            rlwinm  r11,r12,0,30,31             // relatively word aligned?
            cmpwi   cr7,r10,0                   // set cr7 beq if doubleword aligned
            cmpwi   cr1,r11,0                   // set cr1 beq if word aligned
            beq--   c64reverseUncached

            beq     cr7,c64double               // doubleword aligned
            beq     cr1,forward32bit            // word aligned, use G3/G4 code
            cmpwi   r5,0                        // set cr0 on byte count
            b       c64byte                     // unaligned operands

c64reverseUncached:
            beq     cr7,c64rdouble              // doubleword aligned so can use LD/STD
            beq     cr1,reverse32bit            // word aligned, use G3/G4 code
            add     r6,r6,r5                    // point to (end+1) of source and dest
            add     r4,r4,r5
            cmpwi   r5,0                        // set cr0 on length
            b       c64rbyte                    // copy a byte at a time

// Reverse doubleword copies. This is used for all cached copies, and doubleword
// aligned uncached copies.
//   r4 = destination
//   r5 = length (>0)
//   r6 = source
//   r8 = inverse of largest mask of low-order 1s smaller than operand length
//   cr5 = noncache flag

c64rdouble:
            add     r6,r6,r5                    // point to (end+1) of source and dest
            add     r4,r4,r5
            rlwinm  r7,r4,0,29,31               // r7 <- #bytes to doubleword align dest
            andc.   r7,r7,r8                    // limit by operand length
            sub     r5,r5,r7                    // adjust length
            srwi    r8,r5,6                     // r8 <- 64-byte chunks to xfer
            cmpwi   cr1,r8,0                    // any chunks?
            beq     c64rd2                      // dest already doubleword aligned
            mtctr   r7

c64rd1:                                         // copy bytes until dest is doubleword aligned
            lbzu    r0,-1(r6)
            stbu    r0,-1(r4)
            bdnz    c64rd1

c64rd2:                                         // r8/cr1 <- count of 64-byte chunks
            rlwinm  r0,r5,29,29,31              // r0 <- count of leftover doublewords
            andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes
            cmpwi   cr7,r0,0                    // leftover doublewords?
            beq     cr1,c64rd4                  // no chunks to xfer
            mtctr   r8
            b       c64rd3

            .align  5                           // align inner loop
c64rd3:                                         // loop copying 64-byte chunks
            ld      r7,-8(r6)
            ld      r8,-16(r6)
            ld      r9,-24(r6)
            ld      r10,-32(r6)
            ld      r11,-40(r6)
            ld      r12,-48(r6)
            std     r7,-8(r4)
            std     r8,-16(r4)
            ld      r7,-56(r6)
            ldu     r8,-64(r6)
            std     r9,-24(r4)
            std     r10,-32(r4)
            std     r11,-40(r4)
            std     r12,-48(r4)
            std     r7,-56(r4)
            stdu    r8,-64(r4)
            bdnz    c64rd3

c64rd4:                                         // r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
            beq     cr7,c64rbyte                // no leftover doublewords
            mtctr   r0

c64rd5:                                         // loop copying leftover doublewords
            ldu     r0,-8(r6)
            stdu    r0,-8(r4)
            bdnz    c64rd5


// Reverse byte loop.

c64rbyte:                                       // r5/cr0 <- byte count (can be big if unaligned uncached)
            beqlr                               // done if no leftover bytes
            mtctr   r5

c64rbyte1:
            lbzu    r0,-1(r6)
            stbu    r0,-1(r4)
            bdnz    c64rbyte1

            blr