/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
;
; Copy bytes of data around. Handles overlapped data.
;
; Change this to use Altivec later on, and maybe floating point.
;
;
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
#include <assym.s>

; Use CR5_lt to indicate non-cached
#define noncache 20

; Use CR5_gt to indicate that we need to turn data translation back on
#define fixxlate 21

; Use CR5_eq to indicate that we need to invalidate BATs (if 32-bit) or turn off
; 64-bit mode (if 64-bit) before returning to our caller. We overload the
; bit to reduce the number of conditional branches at bcopy exit.
#define restorex 22

; Use CR5_so to indicate that we need to restore real-mode cachability.
; Only needed on 64-bit machines.
#define flipcache 23
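
; For quick reference, a summary of the CR5 state flags above. CR5 occupies
; CR bits 20-23, so CR5_lt=20, CR5_gt=21, CR5_eq=22, and CR5_so=23:
;
;   noncache  (20) - operands are non-cached; avoid dcbz and cache-touch ops
;   fixxlate  (21) - re-enable data translation (MSR[DR]) before returning
;   restorex  (22) - invalidate the BATs (32-bit) or clear MSR[SF] (64-bit)
;   flipcache (23) - restore real-mode cachability via HID4 (64-bit only)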

;
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc operates on non-cached memory so we cannot use any
; cache instructions.
;

        .align  5
        .globl  EXT(bcopy_nc)

LEXT(bcopy_nc)

        crset   noncache                    ; Set non-cached
        b       bcpswap

;
; void bcopy_physvir(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: neither source nor destination can cross a page.
;
; Interrupts must be disabled throughout the copy when this is called.
; To do the copy, we build a 128KB DBAT for both the source and sink. If both are in
; the same block, only one is loaded. We do not touch the IBATs, so there is no issue
; if either physical page address is the same as the virtual address of the
; instructions we are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. Note that on 32-bit architectures this will ignore the high half of the
; passed-in value. This should be ok since we cannot have addresses bigger than
; 32 bits there anyhow.
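;
; In rough C terms, the combining sequence used below is (illustrative only):
;
;     addr = ((uint64_t)hi << 32) | lo;
;
; rlwinm rD,rHI,0,1,0 duplicates the high-half register into both words of rD
; (rlwinm replicates the low 32 bits of its source on 64-bit implementations),
; and rlwimi rD,rLO,0,0,31 then inserts the low half into the bottom 32 bits.
; On a 32-bit processor the same pair simply leaves the low half in rD.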
;
; Note, this one will not work in user state.
;

        .align  5
        .globl  EXT(bcopy_physvir)

LEXT(bcopy_physvir)

        crclr   flipcache                   ; (HACK) No cache flip needed
        mfsprg  r8,2                        ; get processor feature flags
        rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        addic.  r0,r7,-1                    ; Get length - 1
        rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
        add     r11,r3,r0                   ; Point to last byte of sink
        rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
        rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
        mr      r5,r7                       ; Get the length into the right register
        cmplw   cr1,r3,r4                   ; Does source == sink?
        bt++    pf64Bitb,bcopy_phys1        ; if 64-bit processor, use standard routine (no BATs)
        add     r12,r4,r0                   ; Point to last byte of source
        bltlr-                              ; Bail if length is 0 or way too big
        xor     r7,r11,r3                   ; See if we went to next page
        xor     r8,r12,r4                   ; See if we went to next page
        or      r0,r7,r8                    ; Combine wrap

//      li      r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)  ; Set default attributes
        li      r9,((2<<3)|2)               ; Set default attributes
        rlwinm. r0,r0,0,0,19                ; Did we overflow a page?
        li      r7,2                        ; Set validity flags
        li      r8,2                        ; Set validity flags
        bne-    bcopy_phys1                 ; Overflowed page, do normal physical copy...

        crset   restorex                    ; Remember to trash BATs on the way out
        rlwimi  r11,r9,0,15,31              ; Set sink lower DBAT value
        rlwimi  r12,r9,0,15,31              ; Set source lower DBAT value
        rlwimi  r7,r11,0,0,14               ; Set sink upper DBAT value
        rlwimi  r8,r12,0,0,14               ; Set source upper DBAT value
        cmplw   cr1,r11,r12                 ; See if sink and source are same block

        sync

        mtdbatl 0,r11                       ; Set sink lower DBAT
        mtdbatu 0,r7                        ; Set sink upper DBAT

        beq-    cr1,bcpvsame                ; Source and sink are in same block

        mtdbatl 1,r12                       ; Set source lower DBAT
        mtdbatu 1,r8                        ; Set source upper DBAT

bcpvsame:
        sync                                ; wait for BAT to stabilize
        isync
        mr      r6,r3                       ; Set source
        crclr   noncache                    ; Set cached
        crclr   fixxlate                    ; Set translation already ok

        b       copyit32                    ; Go copy it...

;
; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy. Note, this one will
; not work in user state. This routine is used on both 32 and 64-bit
; machines.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. Note that on 32-bit architectures this will ignore the high half of the
; passed-in value. This should be ok since we cannot have addresses bigger than
; 32 bits there anyhow.
;
; Also note that you probably will not be happy if either the sink or source spans
; across the boundary between RAM and I/O space. There is a good chance of hanging
; the machine, and this code will not check, so be careful.
;

        .align  5
        .globl  EXT(bcopy_phys)

LEXT(bcopy_phys)
        crclr   flipcache                   ; (HACK) No cache flip needed
        rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        mfsprg  r8,2                        ; get processor feature flags
        rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
        rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
        rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
        mr      r5,r7                       ; Get the length into the right register

bcopy_phys1:                                ; enter from bcopy_physvir with pf64Bit already in cr6
        mfmsr   r9                          ; Get the MSR
        crclr   noncache                    ; Set cached
        bt++    pf64Bitb,bcopy_phys64       ; skip if 64-bit (only they take hint)

; 32-bit CPUs

        sub.    r0,r3,r4                    ; to==from?
        rlwinm  r8,r9,0,MSR_DR_BIT,MSR_DR_BIT ; was translation on?
        cmpwi   cr1,r8,0                    ; set cr1 beq if translation was off
        oris    r8,r8,hi16(MASK(MSR_VEC))   ; Get vector enable
        cmplwi  cr7,r5,0                    ; Check if we have a 0 length
        beqlr-                              ; bail if to==from
        ori     r8,r8,lo16(MASK(MSR_FP))    ; Get FP
        mr      r6,r3                       ; Set source
        andc    r9,r9,r8                    ; Turn off translation if it is on (should be) and FP, VEC
        beqlr-  cr7                         ; Bail if length is 0

        crclr   restorex                    ; Make sure we do not trash BATs on the way out
        mtmsr   r9                          ; Set DR translation off
        isync                               ; Wait for it

        crnot   fixxlate,cr1_eq             ; Remember to turn on translation if it was
        b       copyit32                    ; Go copy it...

; 64-bit: turn DR off and SF on, remember if we need to restore on way out.

bcopy_phys64:                               ; r9 = MSR

        srdi    r2,r3,31                    ; (HACK) Get a 1 if source is in I/O memory
        srdi.   r0,r9,63-MSR_SF_BIT         ; set cr0 beq on if SF was off when we were called
        rlwinm  r8,r9,MSR_DR_BIT+1,31,31    ; r8 <- DR bit right justified
        cmpld   cr1,r3,r4                   ; to==from?
        li      r0,1                        ; Note - we use this in a couple places below
        lis     r6,hi16(MASK(MSR_VEC))      ; Get vector enable
        cmpwi   cr7,r5,0                    ; length==0 ?
        ori     r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
        beqlr-- cr1                         ; bail if to==from
        srdi    r10,r4,31                   ; (HACK) Get a 1 if sink is in I/O memory
        rldimi  r9,r0,63,MSR_SF_BIT         ; set SF on
        beqlr-- cr7                         ; bail if length==0
        andc    r9,r9,r6                    ; turn DR, VEC, FP off
        cmpwi   cr1,r8,0                    ; was DR on?
        crmove  restorex,cr0_eq             ; if SF was off, remember to turn back off before we return
        mtmsrd  r9                          ; turn 64-bit addressing on, data translation off
        cmpldi  cr0,r2,1                    ; (HACK) Is source in I/O memory?
        isync                               ; wait for it to happen
        mr      r6,r3                       ; Set source
        cmpldi  cr7,r10,1                   ; (HACK) Is sink in I/O memory?
        crnot   fixxlate,cr1_eq             ; if DR was on, remember to turn back on before we return

        cror    flipcache,cr0_eq,cr7_eq     ; (HACK) See if either source or sink is in I/O area

        rlwinm  r10,r9,MSR_EE_BIT+1,31,31   ; (HACK GLORIOUS HACK) Isolate the EE bit
        sldi    r11,r0,31-MSR_EE_BIT        ; (HACK GLORIOUS HACK) Get a mask for the EE bit
        sldi    r0,r0,32+8                  ; (HACK) Get the right bit to turn off caching
        bf++    flipcache,copyit64          ; (HACK) No need to mess with caching...

;
; HACK GLORIOUS HACK - when we force off caching, we need to also force off
; interruptions. We are out of CR bits, so we need to stash the entry EE
; somewhere. It is in the XER.... We NEED to change this!!!!
;

        mtxer   r10                         ; (HACK GLORIOUS HACK) Remember EE
        andc    r9,r9,r11                   ; (HACK GLORIOUS HACK) Turn off EE bit
        mfspr   r2,hid4                     ; (HACK) Get HID4
        crset   noncache                    ; (HACK) Set non-cached
        mtmsrd  r9                          ; (HACK GLORIOUS HACK) Force off EE
        or      r2,r2,r0                    ; (HACK) Set bit to make real accesses cache-inhibited
        sync                                ; (HACK) Sync up
        li      r0,1
        mtspr   hid4,r2                     ; (HACK) Make real accesses cache-inhibited
        isync                               ; (HACK) Toss prefetches

        lis     r12,0xE000                  ; (HACK) Get the unlikeliest ESID possible
        srdi    r12,r12,1                   ; (HACK) Make 0x7FFFFFFFF0000000
        slbie   r12                         ; (HACK) Make sure the ERAT is cleared

        sync                                ; (HACK)
        isync                               ; (HACK)

        b       copyit64


;
; void bcopy(from, to, nbytes)
;

        .align  5
        .globl  EXT(bcopy)

LEXT(bcopy)

        crclr   noncache                    ; Set cached

bcpswap:
        crclr   flipcache                   ; (HACK) No cache flip needed
        mfsprg  r8,2                        ; get processor feature flags
        sub.    r0,r4,r3                    ; test for to==from in mode-independent way
        mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
        cmpwi   cr1,r5,0                    ; Check if we have a 0 length
        crclr   restorex                    ; Make sure we do not trash BATs on the way out
        mr      r6,r3                       ; Set source
        crclr   fixxlate                    ; Set translation already ok
        beqlr-                              ; Bail if "to" and "from" are the same
        beqlr-  cr1                         ; Bail if length is 0
        bt++    pf64Bitb,copyit64           ; handle 64-bit processor
        b       copyit32                    ; Go copy it...

;
; When we move the memory, forward overlays must be handled. We
; also cannot use the cache instructions if we are from bcopy_nc.
; We need to preserve R3 because it needs to be returned for memcpy.
; We can be interrupted and lose control here.
;
; There is no stack, so in order to use vectors, we would
; need to take the vector exception. Any potential gains from using vectors
; would be more than eaten up by this.
;
; NOTE: this code is called in three "modes":
;   - on 32-bit processors (32-byte cache line)
;   - on 64-bit processors running in 32-bit mode (128-byte cache line)
;   - on 64-bit processors running in 64-bit mode (128-byte cache line)
;
; ALSO NOTE: bcopy is called from copyin and copyout etc
; with the "thread_recover" ptr set. This means bcopy must not set up a
; stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs
; and have execution aborted by a "longjmp" to the thread_recover
; routine.
;

        .align  5
        .globl  EXT(memcpy)
; NB: memcpy is only called in 32-bit mode, albeit on both 32- and 64-bit
; processors...
LEXT(memcpy)
        crclr   flipcache                   ; (HACK) No cache flip needed
        mfsprg  r8,2                        ; get processor feature flags
        cmplw   cr1,r3,r4                   ; "to" and "from" the same?
        mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
        mr      r6,r4                       ; Set the "from"
        mr.     r5,r5                       ; Length zero?
        crclr   noncache                    ; Set cached
        mr      r4,r3                       ; Set the "to"
        crclr   fixxlate                    ; Set translation already ok
        beqlr-  cr1                         ; "to" and "from" are the same
        beqlr-                              ; Length is 0
        crclr   restorex                    ; Make sure we do not trash BATs on the way out
        bt++    pf64Bitb,copyit64           ; handle 64-bit processors

copyit32: sub   r12,r4,r6                   ; Get potential overlap (negative if backward move)
        lis     r8,0x7FFF                   ; Start up a mask
        srawi   r11,r12,31                  ; Propagate the sign bit
        dcbt    br0,r6                      ; Touch in the first source line
        cntlzw  r7,r5                       ; Count leading zeroes of the length
        ori     r8,r8,0xFFFF                ; Make limit 0x7FFFFFFF
        xor     r9,r12,r11                  ; If sink - source was negative, invert bits
        srw     r8,r8,r7                    ; Get move length limitation
        sub     r9,r9,r11                   ; If sink - source was negative, add 1 and get absolute value
        cmplw   r12,r5                      ; See if we actually forward overlap
        cmplwi  cr7,r9,32                   ; See if at least a line between source and sink
        dcbtst  br0,r4                      ; Touch in the first sink line
        cmplwi  cr1,r5,32                   ; Are we moving more than a line?
        cror    noncache,noncache,cr7_lt    ; Set to not DCBZ output line if not enough space
        blt-    fwdovrlap                   ; This is a forward overlapping area, handle it...
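
; The overlap test above, informally: let d = sink - source (r12). A reverse
; copy is needed only when the sink starts inside the source, i.e. when
; 0 <= d < length; the unsigned compare cmplw r12,r5 captures exactly that,
; since a negative d shows up as a huge unsigned value. Separately, |d| (r9)
; is compared against a 32-byte line so dcbz of the sink is suppressed
; (noncache set) when the operands are too close together.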
339 | ||
340 | ; | |
341 | ; R4 = sink | |
342 | ; R5 = length | |
343 | ; R6 = source | |
344 | ; | |
345 | ||
346 | ; | |
347 | ; Here we figure out how much we have to move to get the sink onto a | |
348 | ; cache boundary. If we can, and there are still more that 32 bytes | |
349 | ; left to move, we can really speed things up by DCBZing the sink line. | |
350 | ; We can not do this if noncache is set because we will take an | |
351 | ; alignment exception. | |
352 | ||
G4word:                                     ; enter from 64-bit case with word aligned uncached operands
        neg     r0,r4                       ; Get the number of bytes to move to align to a line boundary
        rlwinm. r0,r0,0,27,31               ; Clean it up and test it
        and     r0,r0,r8                    ; limit to the maximum front end move
        mtcrf   3,r0                        ; Make branch mask for partial moves
        sub     r5,r5,r0                    ; Set the length left to move
        beq     alline                      ; Already on a line...
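
; A note on the branch mask: mtcrf 3,r0 copies the low byte of r0 into
; CR6/CR7, so CR bits 27..31 mirror the 16/8/4/2/1-byte components of the
; front-end move count. Each "bf 27..31" below then peels off exactly that
; many bytes, aligning the sink one power of two at a time.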
360 | ||
361 | bf 31,alhalf ; No single byte to do... | |
362 | lbz r7,0(r6) ; Get the byte | |
363 | addi r6,r6,1 ; Point to the next | |
364 | stb r7,0(r4) ; Save the single | |
365 | addi r4,r4,1 ; Bump sink | |
366 | ||
367 | ; Sink is halfword aligned here | |
368 | ||
369 | alhalf: bf 30,alword ; No halfword to do... | |
370 | lhz r7,0(r6) ; Get the halfword | |
371 | addi r6,r6,2 ; Point to the next | |
372 | sth r7,0(r4) ; Save the halfword | |
373 | addi r4,r4,2 ; Bump sink | |
374 | ||
375 | ; Sink is word aligned here | |
376 | ||
377 | alword: bf 29,aldouble ; No word to do... | |
378 | lwz r7,0(r6) ; Get the word | |
379 | addi r6,r6,4 ; Point to the next | |
380 | stw r7,0(r4) ; Save the word | |
381 | addi r4,r4,4 ; Bump sink | |
382 | ||
383 | ; Sink is double aligned here | |
384 | ||
385 | aldouble: bf 28,alquad ; No double to do... | |
386 | lwz r7,0(r6) ; Get the first word | |
387 | lwz r8,4(r6) ; Get the second word | |
388 | addi r6,r6,8 ; Point to the next | |
389 | stw r7,0(r4) ; Save the first word | |
390 | stw r8,4(r4) ; Save the second word | |
391 | addi r4,r4,8 ; Bump sink | |
392 | ||
393 | ; Sink is quadword aligned here | |
394 | ||
395 | alquad: bf 27,alline ; No quad to do... | |
396 | lwz r7,0(r6) ; Get the first word | |
397 | lwz r8,4(r6) ; Get the second word | |
398 | lwz r9,8(r6) ; Get the third word | |
399 | stw r7,0(r4) ; Save the first word | |
400 | lwz r11,12(r6) ; Get the fourth word | |
401 | addi r6,r6,16 ; Point to the next | |
402 | stw r8,4(r4) ; Save the second word | |
403 | stw r9,8(r4) ; Save the third word | |
404 | stw r11,12(r4) ; Save the fourth word | |
405 | addi r4,r4,16 ; Bump sink | |
406 | ||
407 | ; Sink is line aligned here | |
408 | ||
alline: rlwinm. r0,r5,27,5,31               ; Get the number of full lines to move
        mtcrf   3,r5                        ; Make branch mask for backend partial moves
        rlwinm  r11,r5,0,0,26               ; Get number of bytes we are going to move
        beq-    backend                     ; No full lines to move

        sub     r5,r5,r11                   ; Calculate the residual
        li      r10,96                      ; Stride for touch ahead

nxtline: subic. r0,r0,1                     ; Account for the line now

        bt-     noncache,skipz              ; Skip if we are not cached...
        dcbz    br0,r4                      ; Blow away the whole line because we are replacing it
        dcbt    r6,r10                      ; Touch ahead a bit

skipz:  lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        lwz     r9,8(r6)                    ; Get the third word
        stw     r7,0(r4)                    ; Save the first word
        lwz     r11,12(r6)                  ; Get the fourth word
        stw     r8,4(r4)                    ; Save the second word
        lwz     r7,16(r6)                   ; Get the fifth word
        stw     r9,8(r4)                    ; Save the third word
        lwz     r8,20(r6)                   ; Get the sixth word
        stw     r11,12(r4)                  ; Save the fourth word
        lwz     r9,24(r6)                   ; Get the seventh word
        stw     r7,16(r4)                   ; Save the fifth word
        lwz     r11,28(r6)                  ; Get the eighth word
        addi    r6,r6,32                    ; Point to the next
        stw     r8,20(r4)                   ; Save the sixth word
        stw     r9,24(r4)                   ; Save the seventh word
        stw     r11,28(r4)                  ; Save the eighth word
        addi    r4,r4,32                    ; Bump sink
        bgt+    nxtline                     ; Do the next line, if any...

443 | ||
444 | ; Move backend quadword | |
445 | ||
446 | backend: bf 27,noquad ; No quad to do... | |
447 | lwz r7,0(r6) ; Get the first word | |
448 | lwz r8,4(r6) ; Get the second word | |
449 | lwz r9,8(r6) ; Get the third word | |
450 | lwz r11,12(r6) ; Get the fourth word | |
451 | stw r7,0(r4) ; Save the first word | |
452 | addi r6,r6,16 ; Point to the next | |
453 | stw r8,4(r4) ; Save the second word | |
454 | stw r9,8(r4) ; Save the third word | |
455 | stw r11,12(r4) ; Save the fourth word | |
456 | addi r4,r4,16 ; Bump sink | |
457 | ||
458 | ; Move backend double | |
459 | ||
460 | noquad: bf 28,nodouble ; No double to do... | |
461 | lwz r7,0(r6) ; Get the first word | |
462 | lwz r8,4(r6) ; Get the second word | |
463 | addi r6,r6,8 ; Point to the next | |
464 | stw r7,0(r4) ; Save the first word | |
465 | stw r8,4(r4) ; Save the second word | |
466 | addi r4,r4,8 ; Bump sink | |
467 | ||
468 | ; Move backend word | |
469 | ||
470 | nodouble: bf 29,noword ; No word to do... | |
471 | lwz r7,0(r6) ; Get the word | |
472 | addi r6,r6,4 ; Point to the next | |
473 | stw r7,0(r4) ; Save the word | |
474 | addi r4,r4,4 ; Bump sink | |
475 | ||
476 | ; Move backend halfword | |
477 | ||
478 | noword: bf 30,nohalf ; No halfword to do... | |
479 | lhz r7,0(r6) ; Get the halfword | |
480 | addi r6,r6,2 ; Point to the next | |
481 | sth r7,0(r4) ; Save the halfword | |
482 | addi r4,r4,2 ; Bump sink | |
483 | ||
484 | ; Move backend byte | |
485 | ||
486 | nohalf: bf 31,bcpydone ; Leave cuz we are all done... | |
487 | lbz r7,0(r6) ; Get the byte | |
488 | stb r7,0(r4) ; Save the single | |
489 | ||
bcpydone:
        mfmsr   r9                          ; Get the MSR
        bf++    flipcache,bcpydone0         ; (HACK) No need to mess with caching...

        li      r0,1                        ; (HACK) Get a 1
        mfxer   r10                         ; (HACK GLORIOUS HACK) Get the entry EE
        sldi    r0,r0,32+8                  ; (HACK) Get the right bit to turn off caching
        mfspr   r2,hid4                     ; (HACK) Get HID4
        rlwinm  r10,r10,31-MSR_EE_BIT,MSR_EE_BIT,MSR_EE_BIT ; (HACK GLORIOUS HACK) Set the EE bit
        andc    r2,r2,r0                    ; (HACK) Clear bit to make real accesses cache-inhibited
        or      r9,r9,r10                   ; (HACK GLORIOUS HACK) Set the EE in MSR
        sync                                ; (HACK) Sync up
        mtspr   hid4,r2                     ; (HACK) Make real accesses not cache-inhibited
        isync                               ; (HACK) Toss prefetches

        lis     r12,0xE000                  ; (HACK) Get the unlikeliest ESID possible
        srdi    r12,r12,1                   ; (HACK) Make 0x7FFFFFFFF0000000
        slbie   r12                         ; (HACK) Make sure the ERAT is cleared

        mtmsr   r9                          ; (HACK GLORIOUS HACK) Set EE properly

bcpydone0:
        lis     r0,hi16(MASK(MSR_VEC))      ; Get the vector bit
        ori     r0,r0,lo16(MASK(MSR_FP))    ; Get the float bit
        bf++    fixxlate,bcpydone1          ; skip if we do not need to fix translation...
        ori     r9,r9,lo16(MASK(MSR_DR))    ; Turn data translation on
        andc    r9,r9,r0                    ; Make sure that FP and VEC are off
        mtmsr   r9                          ; Just do it
        isync                               ; Hang in there

bcpydone1:
        bflr++  restorex                    ; done if we do not have to fix up addressing
        mfsprg  r8,2                        ; get the feature flags again
        mtcrf   0x02,r8                     ; put pf64Bit where we can test it
        bt++    pf64Bitb,bcpydone2          ; skip if 64-bit processor

; 32-bit processor, so clear out the BATs we set up for bcopy_physvir

        li      r0,0                        ; Get set to invalidate upper half
        sync                                ; Make sure all is well
        mtdbatu 0,r0                        ; Clear sink upper DBAT
        mtdbatu 1,r0                        ; Clear source upper DBAT
        sync
        isync
        blr

; 64-bit processor, so turn off 64-bit mode we turned on to do bcopy_phys

bcpydone2:
        mfmsr   r9                          ; get MSR again
        andc    r9,r9,r0                    ; Make sure that FP and VEC are off
        rldicl  r9,r9,0,MSR_SF_BIT+1        ; clear SF
        mtmsrd  r9
        isync
        blr
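
; Exit-path summary (informal): bcpydone unwinds whatever the entry path set
; up, in order: the flipcache HID4/EE hack, then fixxlate (re-enable
; MSR[DR]), then restorex (invalidate the DBATs on 32-bit processors, or
; clear MSR[SF] on 64-bit ones). Flags that were never set fall straight
; through.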
545 | ||
9bccf70c | 546 | |
1c79356b A |
547 | ; |
548 | ; 0123456789ABCDEF0123456789ABCDEF | |
549 | ; 0123456789ABCDEF0123456789ABCDEF | |
550 | ; F | |
551 | ; DE | |
552 | ; 9ABC | |
553 | ; 12345678 | |
554 | ; 123456789ABCDEF0 | |
555 | ; 0 | |
556 | ||
557 | ; | |
558 | ; Here is where we handle a forward overlapping move. These will be slow | |
559 | ; because we can not kill the cache of the destination until after we have | |
560 | ; loaded/saved the source area. Also, because reading memory backwards is | |
561 | ; slower when the cache line needs to be loaded because the critical | |
562 | ; doubleword is loaded first, i.e., the last, then it goes back to the first, | |
563 | ; and on in order. That means that when we are at the second to last DW we | |
564 | ; have to wait until the whole line is in cache before we can proceed. | |
565 | ; | |

G4reverseWord:                              ; here from 64-bit code with word aligned uncached operands
fwdovrlap: add  r4,r5,r4                    ; Point past the last sink byte
        add     r6,r5,r6                    ; Point past the last source byte
        and     r0,r4,r8                    ; Apply movement limit
        li      r12,-1                      ; Make sure we touch in the actual line
        mtcrf   3,r0                        ; Figure out the best way to move backwards
        dcbt    r12,r6                      ; Touch in the last line of source
        rlwinm. r0,r0,0,27,31               ; Calculate the length to adjust to cache boundary
        dcbtst  r12,r4                      ; Touch in the last line of the sink
        beq-    balline                     ; Already on cache line boundary

        sub     r5,r5,r0                    ; Precalculate move length left after alignment

        bf      31,balhalf                  ; No single byte to do...
        lbz     r7,-1(r6)                   ; Get the byte
        subi    r6,r6,1                     ; Point to the next
        stb     r7,-1(r4)                   ; Save the single
        subi    r4,r4,1                     ; Bump sink

; Sink is halfword aligned here

balhalf: bf     30,balword                  ; No halfword to do...
        lhz     r7,-2(r6)                   ; Get the halfword
        subi    r6,r6,2                     ; Point to the next
        sth     r7,-2(r4)                   ; Save the halfword
        subi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

balword: bf     29,baldouble                ; No word to do...
        lwz     r7,-4(r6)                   ; Get the word
        subi    r6,r6,4                     ; Point to the next
        stw     r7,-4(r4)                   ; Save the word
        subi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

baldouble: bf   28,balquad                  ; No double to do...
        lwz     r7,-8(r6)                   ; Get the first word
        lwz     r8,-4(r6)                   ; Get the second word
        subi    r6,r6,8                     ; Point to the next
        stw     r7,-8(r4)                   ; Save the first word
        stw     r8,-4(r4)                   ; Save the second word
        subi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

balquad: bf     27,balline                  ; No quad to do...
        lwz     r7,-16(r6)                  ; Get the first word
        lwz     r8,-12(r6)                  ; Get the second word
        lwz     r9,-8(r6)                   ; Get the third word
        lwz     r11,-4(r6)                  ; Get the fourth word
        stw     r7,-16(r4)                  ; Save the first word
        subi    r6,r6,16                    ; Point to the next
        stw     r8,-12(r4)                  ; Save the second word
        stw     r9,-8(r4)                   ; Save the third word
        stw     r11,-4(r4)                  ; Save the fourth word
        subi    r4,r4,16                    ; Bump sink

; Sink is line aligned here

balline: rlwinm. r0,r5,27,5,31              ; Get the number of full lines to move
        mtcrf   3,r5                        ; Make branch mask for backend partial moves
        beq-    bbackend                    ; No full lines to move

; Registers in use:     R0, R1, R3, R4, R5, R6
; Registers not in use: R2, R7, R8, R9, R10, R11, R12 - OK, we can free up one
; more to get the 8 we need (R5 is reusable once the branch mask is made)

bnxtline: subic. r0,r0,1                    ; Account for the line now

        lwz     r7,-32(r6)                  ; Get the first word
        lwz     r5,-28(r6)                  ; Get the second word
        lwz     r2,-24(r6)                  ; Get the third word
        lwz     r12,-20(r6)                 ; Get the fourth word
        lwz     r11,-16(r6)                 ; Get the fifth word
        lwz     r10,-12(r6)                 ; Get the sixth word
        lwz     r9,-8(r6)                   ; Get the seventh word
        lwz     r8,-4(r6)                   ; Get the eighth word
        subi    r6,r6,32                    ; Point to the next

        stw     r7,-32(r4)                  ; Save the first word
        ble-    bnotouch                    ; Last time, skip touch of source...
        dcbt    br0,r6                      ; Touch in next source line

bnotouch: stw   r5,-28(r4)                  ; Save the second word
        stw     r2,-24(r4)                  ; Save the third word
        stw     r12,-20(r4)                 ; Save the fourth word
        stw     r11,-16(r4)                 ; Save the fifth word
        stw     r10,-12(r4)                 ; Save the sixth word
        stw     r9,-8(r4)                   ; Save the seventh word
        stw     r8,-4(r4)                   ; Save the eighth word
        subi    r4,r4,32                    ; Bump sink

        bgt+    bnxtline                    ; Do the next line, if any...

;
; Note: We touched these lines in at the beginning
;

; Move backend quadword

bbackend: bf    27,bnoquad                  ; No quad to do...
        lwz     r7,-16(r6)                  ; Get the first word
        lwz     r8,-12(r6)                  ; Get the second word
        lwz     r9,-8(r6)                   ; Get the third word
        lwz     r11,-4(r6)                  ; Get the fourth word
        stw     r7,-16(r4)                  ; Save the first word
        subi    r6,r6,16                    ; Point to the next
        stw     r8,-12(r4)                  ; Save the second word
        stw     r9,-8(r4)                   ; Save the third word
        stw     r11,-4(r4)                  ; Save the fourth word
        subi    r4,r4,16                    ; Bump sink

; Move backend double

bnoquad: bf     28,bnodouble                ; No double to do...
        lwz     r7,-8(r6)                   ; Get the first word
        lwz     r8,-4(r6)                   ; Get the second word
        subi    r6,r6,8                     ; Point to the next
        stw     r7,-8(r4)                   ; Save the first word
        stw     r8,-4(r4)                   ; Save the second word
        subi    r4,r4,8                     ; Bump sink

; Move backend word

bnodouble: bf   29,bnoword                  ; No word to do...
        lwz     r7,-4(r6)                   ; Get the word
        subi    r6,r6,4                     ; Point to the next
        stw     r7,-4(r4)                   ; Save the word
        subi    r4,r4,4                     ; Bump sink

; Move backend halfword

bnoword: bf     30,bnohalf                  ; No halfword to do...
        lhz     r7,-2(r6)                   ; Get the halfword
        subi    r6,r6,2                     ; Point to the next
        sth     r7,-2(r4)                   ; Save the halfword
        subi    r4,r4,2                     ; Bump sink

; Move backend byte

bnohalf: bf     31,bcpydone                 ; Leave cuz we are all done...
        lbz     r7,-1(r6)                   ; Get the byte
        stb     r7,-1(r4)                   ; Save the single

        b       bcpydone                    ; Go exit cuz we are all done...


// Here on 64-bit processors, which have a 128-byte cache line. This can be
// called either in 32 or 64-bit mode, which makes the test for reverse moves
// a little tricky. We've already filtered out the (source==dest) and (len==0)
// special cases.
//
// When entered:
//      r4 = destination (32 or 64-bit ptr)
//      r5 = length (always 32 bits)
//      r6 = source (32 or 64-bit ptr)
//      cr5 = noncache, fixxlate, flipcache, and restorex flags set

        .align  5
copyit64:
        lis     r2,0x4000                   // r2 = 0x00000000 40000000
        neg     r12,r4                      // start to compute #bytes to align dest
        bt--    noncache,noncache1          // (HACK) Do not even try anything cached...
        dcbt    0,r6                        // touch in 1st block of source
noncache1:
        add.    r2,r2,r2                    // if 0x00000000 80000000 < 0, we are in 32-bit mode
        cntlzw  r9,r5                       // get highest power-of-2 in length
        rlwinm  r7,r12,0,25,31              // r7 <- bytes to 128-byte align dest
        bt--    noncache,noncache2          // (HACK) Do not even try anything cached...
        dcbtst  0,r4                        // touch in 1st destination cache block
noncache2:
        sraw    r2,r2,r9                    // get mask with 1s for leading 0s in length, plus 1 more 1-bit
        bge     copyit64a                   // skip if we are running in 64-bit mode
        rlwinm  r4,r4,0,0,31                // running in 32-bit mode, so truncate ptrs and lengths to 32 bits
        rlwinm  r5,r5,0,0,31
        rlwinm  r6,r6,0,0,31
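
// Mode-detection note (informal): add. doubles 0x40000000 to 0x80000000.
// With MSR[SF]=0 a record-form result is compared as a 32-bit quantity, so
// cr0 tests "lt" exactly when we are running in 32-bit mode; in that case
// the rlwinm instructions above zero the upper halves of the pointers and
// length so that the 64-bit compares below remain valid.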
copyit64a:                                  // now we can use 64-bit compares even if running in 32-bit mode
        sub     r8,r4,r6                    // get (dest-source)
        andc    r7,r7,r2                    // limit bytes to align by operand length
        cmpld   cr1,r8,r5                   // if (dest-source)<length, must move reverse
        bt--    noncache,c64uncached        // skip if uncached
        blt--   cr1,c64rdouble              // handle cached reverse moves


// Forward, cached or doubleword aligned uncached. This is the common case.
//      r4-r6 = dest, length, source (as above)
//      r7 = #bytes to 128-byte align dest (limited by copy length)
//      cr5 = flags, as above

c64double:
        andi.   r8,r7,7                     // r8 <- #bytes to doubleword align
        srwi    r9,r7,3                     // r9 <- #doublewords to 128-byte align
        sub     r5,r5,r7                    // adjust length remaining
        cmpwi   cr1,r9,0                    // any doublewords to move to cache align?
        srwi    r10,r5,7                    // r10 <- 128-byte chunks to xfer after aligning dest
        cmpwi   cr7,r10,0                   // set cr7 on chunk count
        beq     c64double2                  // dest already doubleword aligned
        mtctr   r8
        b       c64double1

        .align  5                           // align inner loops
c64double1:                                 // copy bytes until dest is doubleword aligned
        lbz     r0,0(r6)
        addi    r6,r6,1
        stb     r0,0(r4)
        addi    r4,r4,1
        bdnz    c64double1

c64double2:                                 // r9/cr1=doublewords, r10/cr7=128-byte chunks
        beq     cr1,c64double4              // no doublewords to xfer in order to cache align
        mtctr   r9
        b       c64double3

        .align  5                           // align inner loops
c64double3:                                 // copy doublewords until dest is 128-byte aligned
        ld      r7,0(r6)
        addi    r6,r6,8
        std     r7,0(r4)
        addi    r4,r4,8
        bdnz    c64double3

// Here to xfer 128-byte chunks, if any. Because the IBM 970 cannot issue two stores/cycle,
// we pipeline the inner loop so we can pair loads and stores. Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.

c64double4:                                 // r10/cr7=128-byte chunks
        rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords, after moving chunks
        cmpwi   cr1,r0,0                    // set cr1 on leftover doublewords
        beq     cr7,c64double7              // no 128-byte chunks
        sub     r8,r6,r4                    // r8 <- (source - dest)
        li      r9,128                      // start at next cache line (we've already touched in 1st line)
        cmpldi  cr7,r8,128                  // if (source-dest)<128, cannot use dcbz128 because of overlap
        cror    noncache,cr7_lt,noncache    // turn on "noncache" flag if (source-dest)<128
        bt--    noncache,noncache3          // (HACK) Skip cache touch if noncachable
        dcbt128 r9,r6,1                     // start forward stream
noncache3:
        mtctr   r10

        ld      r0,0(r6)                    // start pipe: load 1st half-line
        ld      r2,8(r6)
        ld      r7,16(r6)
        ld      r8,24(r6)
        ld      r9,32(r6)
        ld      r10,40(r6)
        ld      r11,48(r6)
        ld      r12,56(r6)
        b       c64InnerLoopEntryPt

        .align  5                           // align inner loop
c64InnerLoop:                               // loop copying 128-byte cache lines to 128-aligned destination
        std     r0,64(r4)                   // store 2nd half of chunk n
        ld      r0,0(r6)                    // load 1st half of chunk n+1
        std     r2,72(r4)
        ld      r2,8(r6)
        std     r7,80(r4)
        ld      r7,16(r6)
        std     r8,88(r4)
        ld      r8,24(r6)
        std     r9,96(r4)
        ld      r9,32(r6)
        std     r10,104(r4)
        ld      r10,40(r6)
        std     r11,112(r4)
        ld      r11,48(r6)
        std     r12,120(r4)
        ld      r12,56(r6)
        addi    r4,r4,128                   // advance to next dest chunk
c64InnerLoopEntryPt:                        // initial entry into loop, with 1st halfline loaded
        bt      noncache,c64InnerLoop1      // skip if uncached or overlap
        dcbz128 0,r4                        // avoid prefetch of next cache line
c64InnerLoop1:
        std     r0,0(r4)                    // store 1st half of chunk n
        ld      r0,64(r6)                   // load 2nd half of chunk n
        std     r2,8(r4)
        ld      r2,72(r6)
        std     r7,16(r4)
        ld      r7,80(r6)
        std     r8,24(r4)
        ld      r8,88(r6)
        std     r9,32(r4)
        ld      r9,96(r6)
        std     r10,40(r4)
        ld      r10,104(r6)
        std     r11,48(r4)
        ld      r11,112(r6)
        std     r12,56(r4)
        ld      r12,120(r6)
        addi    r6,r6,128                   // advance to next source chunk if any
        bdnz    c64InnerLoop                // loop if more chunks

        std     r0,64(r4)                   // store 2nd half of last chunk
        std     r2,72(r4)
        std     r7,80(r4)
        std     r8,88(r4)
        std     r9,96(r4)
        std     r10,104(r4)
        std     r11,112(r4)
        std     r12,120(r4)
        addi    r4,r4,128                   // advance to next dest chunk

c64double7:                                 // r5 <- leftover bytes, cr1 set on doubleword count
        rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords (0-15)
        andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes (0-7)
        beq     cr1,c64byte                 // no leftover doublewords
        mtctr   r0
        b       c64double8

        .align  5                           // align inner loop
c64double8:                                 // loop copying leftover doublewords
        ld      r0,0(r6)
        addi    r6,r6,8
        std     r0,0(r4)
        addi    r4,r4,8
        bdnz    c64double8


// Forward byte loop.

c64byte:                                    // r5/cr0 <- byte count (can be big if unaligned uncached)
        beq     bcpydone                    // done if no leftover bytes
        mtctr   r5
        b       c64byte1

        .align  5                           // align inner loop
c64byte1:
        lbz     r0,0(r6)
        addi    r6,r6,1
        stb     r0,0(r4)
        addi    r4,r4,1
        bdnz    c64byte1

        b       bcpydone


// Uncached copies. We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
//      r4-r6 = dest, length, source (as above)
//      r2 = mask of 1s for leading 0s in length, plus 1 extra 1
//      r7 = #bytes to copy to 128-byte align dest (limited by operand length)
//      cr1 = blt if reverse move required
c64uncached:
        xor     r0,r6,r4                    // get relative alignment
        rlwinm  r10,r0,0,29,31              // relatively doubleword aligned?
        rlwinm  r11,r0,0,30,31              // relatively word aligned?
        not     r8,r2                       // get mask to limit initial length of copy for G4word
        blt     cr1,c64reverseUncached

        cmpwi   cr0,r10,0                   // set cr0 beq if doubleword aligned
        cmpwi   cr1,r11,0                   // set cr1 beq if word aligned
        beq     cr0,c64double               // doubleword aligned
        beq     cr1,G4word                  // word aligned, use G3/G4 code
        cmpwi   r5,0                        // set cr0 on byte count
        b       c64byte                     // unaligned operands

c64reverseUncached:
        cmpwi   cr0,r10,0                   // set cr0 beq if doubleword aligned
        cmpwi   cr1,r11,0                   // set cr1 beq if word aligned
        beq     cr0,c64rdouble              // doubleword aligned so can use LD/STD
        beq     cr1,G4reverseWord           // word aligned, use G3/G4 code
        add     r6,r6,r5                    // point to (end+1) of source and dest
        add     r4,r4,r5
        cmpwi   r5,0                        // set cr0 on length
        b       c64rbyte                    // copy a byte at a time

// Reverse doubleword copies. This is used for all cached copies, and doubleword
// aligned uncached copies.
//      r4 = destination (32 or 64-bit ptr)
//      r5 = length (always 32 bits)
//      r6 = source (32 or 64-bit ptr)
//      cr5 = noncache, fixxlate, and restorex flags set

c64rdouble:
        add     r6,r6,r5                    // point to (end+1) of source and dest
        add     r4,r4,r5
        rlwinm. r7,r4,0,29,31               // r7 <- #bytes to doubleword align dest
        cmplw   cr1,r7,r5                   // operand long enough to doubleword align?
        blt     cr1,c64rd0                  // yes
        mr      r7,r5                       // no
c64rd0:
        sub     r5,r5,r7                    // adjust length
        srwi    r8,r5,6                     // r8 <- 64-byte chunks to xfer
        cmpwi   cr1,r8,0                    // any chunks?
        beq     c64rd2                      // dest already doubleword aligned
        mtctr   r7

c64rd1:                                     // copy bytes until dest is doubleword aligned
        lbzu    r0,-1(r6)
        stbu    r0,-1(r4)
        bdnz    c64rd1

c64rd2:                                     // r8/cr1 <- count of 64-byte chunks
        rlwinm  r0,r5,29,29,31              // r0 <- count of leftover doublewords
        andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes
        cmpwi   cr7,r0,0                    // leftover doublewords?
        beq     cr1,c64rd4                  // no chunks to xfer
        li      r9,-128                     // start at next cache line
        mtctr   r8
        bt      noncache,c64rd3             // (HACK) Do not start a stream if noncachable...
        dcbt128 r9,r6,3                     // start reverse stream
        b       c64rd3

        .align  5                           // align inner loop
c64rd3:                                     // loop copying 64-byte chunks
        ld      r7,-8(r6)
        ld      r8,-16(r6)
        ld      r9,-24(r6)
        ld      r10,-32(r6)
        ld      r11,-40(r6)
        ld      r12,-48(r6)
        std     r7,-8(r4)
        std     r8,-16(r4)
        ld      r7,-56(r6)
        ldu     r8,-64(r6)
        std     r9,-24(r4)
        std     r10,-32(r4)
        std     r11,-40(r4)
        std     r12,-48(r4)
        std     r7,-56(r4)
        stdu    r8,-64(r4)
        bdnz    c64rd3

c64rd4:                                     // r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
        beq     cr7,c64rbyte                // no leftover doublewords
        mtctr   r0

c64rd5:                                     // loop copying leftover doublewords
        ldu     r0,-8(r6)
        stdu    r0,-8(r4)
        bdnz    c64rd5


// Reverse byte loop.

c64rbyte:                                   // r5/cr0 <- byte count (can be big if unaligned uncached)
        beq     bcpydone                    // done if no leftover bytes
        mtctr   r5

c64rbyte1:
        lbzu    r0,-1(r6)
        stbu    r0,-1(r4)
        bdnz    c64rbyte1

        b       bcpydone