/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#define kShort 11
#define cr1_gt 5 // bit 1 of cr1

/*
 * short xsum_assym(short *p, int len, short xsum, boolean odd);
 *
 * r3 - Pointer to data
 * r4 - Length of data
 * r5 - Accumulated sum value
 * r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data)
 *
 * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we
 * correctly handle the case where the flag is set and the address is odd.
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers. 1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC. Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit. On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64 bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 * is set on the low 32 bits of the sum).
 *
 * Using Altivec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably makes it
 * impractical for now.
 */
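
/*
 * For reference, a minimal C sketch of the algorithm described above (the
 * names here are illustrative, invented for this note, and appear nowhere
 * else in the kernel). The data is summed as 16-bit words in a wider
 * accumulator, and the accumulator is then folded down to 16 bits; the fold
 * is what makes the parallel wide adds equivalent to a 1s-complement sum:
 *
 *     static unsigned short
 *     xsum_sketch(const unsigned short *p, int nwords)
 *     {
 *         unsigned long sum = 0;        // carries collect above bit 15
 *         while (nwords--)
 *             sum += *p++;              // plain adds, no per-add carry handling
 *         while (sum >> 16)             // fold carries back in until sum fits in 16 bits
 *             sum = (sum & 0xFFFF) + (sum >> 16);
 *         return (unsigned short)sum;   // like this routine, returns the uncomplemented sum
 *     }
 */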
        .globl  _xsum_assym
        .globl  _xsum_nop_if_32bit
        .text
        .align  5
_xsum_assym:
        cmplwi  cr0,r4,kShort           ; too short to word align?
        rlwinm  r2,r3,0,0x3             ; get byte offset in word
        dcbt    0,r3                    ; touch in 1st cache line
        cmpwi   cr6,r2,0                ; is address word aligned?
        ble     cr0,Lshort              ; skip if too short to bother aligning

        subfic  r0,r2,4                 ; get #bytes in partial word
        cmplwi  cr1,r6,0                ; set cr1_gt if "starting on odd address" flag is set
        addic   r0,r0,0                 ; turn off carry
        beq     cr6,Laligned            ; skip if already word aligned (r2==0 if aligned)

; Partial word at start: zero filled on left, it becomes initial checksum.

        rlwinm  r3,r3,0,0,29            ; word align address
        mtcrf   0x01,r2                 ; move byte offset to cr7
        lwz     r6,0(r3)                ; get partial word
        li      r7,-1                   ; start of mask for partial fill
        slwi    r8,r2,3                 ; multiply byte offset by 8
        sub     r4,r4,r0                ; adjust length for bytes in partial word
        crxor   cr1_gt,31,cr1_gt        ; set flag if byte-lane swap will be necessary
        srw     r7,r7,r8                ; get mask for bytes to keep in partial word
        addi    r3,r3,4                 ; point to next word of input
        and     r2,r6,r7                ; zero fill on left

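; In C terms (illustrative names, not from this file), the fill above computes,
; for a byte offset "off" in 1..3 within the first (big-endian) word:
;
;     uint32_t mask    = 0xFFFFFFFFu >> (off * 8); // e.g. off==1 keeps the low 3 bytes
;     uint32_t partial = word & mask;              // bytes before the buffer are zeroed
;     odd ^= (off & 1);                            // the crxor: if alignment consumed an
;                                                  ; odd number of bytes, lane parity flips
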
; Address is now word aligned. Prepare for inner loop over 32-byte chunks.
; r2 = initial checksum
; r3 = word aligned address
; r4 = length remaining
; r5 = accumulated sum parameter
; carry = off
; cr1_gt = "starting on odd address" flag

Laligned:
        srwi.   r0,r4,5                 ; get count of 32-byte chunks
        mtcrf   0x02,r4                 ; move residual length to cr6 and cr7
        mtcrf   0x01,r4
        beq     cr0,Lleftovers          ; no chunks

        mtctr   r0                      ; set up loop count
        li      r4,32                   ; offset to next chunk
_xsum_nop_if_32bit:
        b       L64BitPath              ; use the 64-bit path (patched to nop on 32-bit machine)
        dcbt    r4,r3                   ; touch in 2nd cache line
        li      r0,96                   ; get touch offset
        b       LInnerLoop32            ; enter 32-bit loop

; Inner loop for 32-bit machines.

        .align  4
LInnerLoop32:
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        adde    r2,r2,r4
        lwz     r9,16(r3)
        adde    r2,r2,r6
        lwz     r10,20(r3)
        adde    r2,r2,r7
        lwz     r11,24(r3)
        adde    r2,r2,r8
        lwz     r12,28(r3)
        adde    r2,r2,r9
        dcbt    r3,r0
        adde    r2,r2,r10
        addi    r3,r3,32
        adde    r2,r2,r11
        adde    r2,r2,r12
        bdnz+   LInnerLoop32

; Handle leftover bytes.
; r2 = checksum so far
; r3 = word aligned address
; r5 = accumulated sum parameter
; carry = live
; cr1_gt = "starting on odd address" flag
; cr6,cr7 = residual length

Lleftovers:
        bf      27,Lleftover8           ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        adde    r2,r2,r4
        adde    r2,r2,r6
        adde    r2,r2,r7
        adde    r2,r2,r8
Lleftover8:
        bf      28,Lleftover4
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        adde    r2,r2,r4
        adde    r2,r2,r6
Lleftover4:
        bf      29,Lleftover2
        lwz     r4,0(r3)
        addi    r3,r3,4
        adde    r2,r2,r4
Lleftover2:
        bf      30,Lleftover1
        lhz     r4,0(r3)
        addi    r3,r3,2
        adde    r2,r2,r4
Lleftover1:
        bf      31,Lwrapup
        lbz     r4,0(r3)
        slwi    r4,r4,8                 ; shift last byte into proper lane
        adde    r2,r2,r4

; All data bytes checksummed. Wrap up.
; r2 = checksum so far (word parallel)
; r5 = accumulated sum parameter
; carry = live
; cr1_gt = "starting on odd address" flag

Lwrapup:
        addze   r2,r2                   ; add in last carry
        addze   r2,r2                   ; in case the "addze" carries
Lwrapupx:                               ; here from short-operand case, with xer(ca) undefined
        srwi    r6,r2,16                ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF          ; lower half
        add     r2,r6,r7                ; add them together
        srwi    r6,r2,16                ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r2,r6,r7
        bf      cr1_gt,Lswapped         ; test "starting on odd address" flag

; The checksum began on an odd address, so swap bytes.

        rlwinm  r6,r2,24,0x00FF         ; move top byte to bottom
        rlwinm  r7,r2,8,0xFF00          ; bottom to top
        or      r2,r6,r7                ; rejoin

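; Why a byte swap is sufficient (illustrative C): a 1s-complement sum computed
; with every 16-bit lane rotated by one byte equals the true sum rotated by one
; byte, so swapping the two bytes of the folded result recovers the checksum:
;
;     sum = ((sum >> 8) | (sum << 8)) & 0xFFFF;    // undo the one-byte rotation
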
; Finally, add in checksum passed in as a parameter.

Lswapped:
        add     r2,r2,r5                ; add passed-in checksum
        srwi    r6,r2,16                ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF          ; lower half
        add     r2,r6,r7                ; add them together
        srwi    r6,r2,16                ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r3,r6,r7                ; steer result into r3
        blr

; Handle short operands. Do a halfword at a time.
; r3 = address
; r4 = length (<= kShort)
; r5 = accumulated sum parameter
; r6 = "starting on odd address" flag

Lshort:
        cmpwi   cr6,r4,2                ; at least two bytes?
        andi.   r0,r4,1                 ; odd length?
        li      r2,0                    ; initialize checksum
        cmplwi  cr1,r6,0                ; set cr1_gt if "starting on odd address" flag is set
        blt     cr6,Lshort2             ; fewer than two bytes, so skip
Lshort1:
        cmpwi   cr6,r4,4                ; two more bytes (after we decrement)?
        lhz     r7,0(r3)
        subi    r4,r4,2
        addi    r3,r3,2
        add     r2,r2,r7                ; no "adde" needed: at most 5 halfwords, so no 32-bit overflow
        bge     cr6,Lshort1             ; loop for 2 more bytes
Lshort2:
        beq     Lwrapupx                ; no byte at end, proceed to wrap-up with carry undefined
        lbz     r7,0(r3)
        slwi    r7,r7,8                 ; shift last byte into proper lane
        add     r2,r2,r7
        b       Lwrapupx

; Handle 64-bit machine. The major improvement over the 32-bit path is that we use
; four parallel accumulators, adding 32-bit words into 64-bit registers; carries go
; into the upper halves naturally, so we do not have to use "adde", which serializes
; on the carry bit. Note that we cannot do 64-bit "adde"s, because we run in 32-bit
; mode so carry would not be set correctly.
; r2 = checksum so far (i.e., the zero-filled partial first word)
; r3 = word aligned address
; r5 = accumulated sum parameter
; ctr = number of 32-byte chunks of input
; carry = unused in this code
; cr1_gt = "starting on odd address" flag
; cr6,cr7 = residual length

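; In illustrative C (names invented for this note), the trick is simply:
;
;     uint64_t a0 = 0, a1 = 0, a2 = 0, a3 = 0;     // four independent dependency chains
;     for ( ; nwords >= 8; p += 8, nwords -= 8) {  // 32 bytes per iteration
;         a0 += p[0]; a1 += p[1]; a2 += p[2]; a3 += p[3];
;         a0 += p[4]; a1 += p[5]; a2 += p[6]; a3 += p[7];
;     }
;     uint64_t s = a0 + a1 + a2 + a3;              // then fold 64 -> 32 -> 16 bits
;
; Each 32-bit word is at most 0xFFFFFFFF, so a 64-bit accumulator can absorb
; billions of them before overflowing; carries out of bit 31 just land in the
; upper half, to be folded once at the end.
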
L64BitPath:
        stw     r13,-4(r1)              ; save a few nonvolatile regs in red zone so we can use them
        stw     r14,-8(r1)
        stw     r15,-12(r1)
        stw     r16,-16(r1)
        li      r0,128                  ; to touch next line
        li      r13,0                   ; r13-r16 are the accumulators, so initialize them
        dcbt    r3,r0                   ; touch in next cache line, and keep loads away from the above stores
        lwz     r4,0(r3)                ; start pipeline by loading first 32 bytes into r4, r6-r12
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        mr      r14,r2                  ; just copy incoming partial word into one of the accumulators
        li      r15,0
        lwz     r8,12(r3)
        lwz     r9,16(r3)
        li      r16,0
        li      r0,256                  ; get touch offset
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        lwz     r12,28(r3)              ; load last word of first chunk
        addi    r3,r3,32                ; skip past the chunk
        bdnz++  LInnerLoop64            ; enter loop if another chunk to go

        b       LAddLastChunk           ; only one chunk

; Inner loop for 64-bit processors. This loop is scheduled for the 970.
; It is pipelined (loads are one iteration ahead of adds), and unrolled.
; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.
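;
; Ignoring the 2x unrolling, the software pipeline looks like this in
; illustrative pseudo-C:
;
;     load chunk 0 into r4,r6..r12;
;     while (--chunks) {
;         accumulate r4,r6..r12;                // adds for the chunk loaded last time
;         load the next chunk into r4,r6..r12;  // overlapped with those adds
;     }
;     accumulate r4,r6..r12;                    // drain the final preloaded chunk
;
; LEarlyExit and LAddLastChunk below implement the drain step.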

        .align  5
LInnerLoop64:                           ; 64 bytes/iteration
        add     r13,r13,r4              ; cycle 1
        add     r14,r14,r6
        dcbt    r3,r0                   ; touch in 2 lines ahead
        lwz     r4,0(r3)

        add     r15,r15,r7              ; cycle 2, etc
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        add     r16,r16,r8

        lwz     r8,12(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,16(r3)

        add     r15,r15,r11
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        add     r16,r16,r12
        bdz--   LEarlyExit              ; early exit if no more chunks

        lwz     r12,28(r3)
        add     r13,r13,r4
        add     r14,r14,r6
        lwz     r4,32(r3)

        add     r15,r15,r7
        lwz     r6,36(r3)
        lwz     r7,40(r3)
        add     r16,r16,r8

        lwz     r8,44(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,48(r3)

        add     r15,r15,r11
        lwz     r10,52(r3)
        lwz     r11,56(r3)
        add     r16,r16,r12

        nop                             ; position last load in 2nd dispatch slot
        lwz     r12,60(r3)
        addi    r3,r3,64
        bdnz++  LInnerLoop64

        b       LAddLastChunk

; Add in the last 32-byte chunk, and any leftover bytes.
; r3 = word aligned address of next byte of data
; r5 = accumulated sum parameter
; r13-r16 = the four accumulators
; cr1_gt = "starting on odd address" flag
; cr6,cr7 = residual length

LEarlyExit:                             ; here from middle of inner loop
        lwz     r12,28(r3)              ; load last word of last chunk
        addi    r3,r3,32
LAddLastChunk:                          ; last 32-byte chunk of input is in r4,r6-r12
        add     r13,r13,r4              ; add in last chunk
        add     r14,r14,r6              ; these are 64-bit adds
        add     r15,r15,r7
        add     r16,r16,r8
        add     r13,r13,r9
        add     r14,r14,r10
        add     r15,r15,r11
        add     r16,r16,r12

; Handle leftover bytes, if any.

        bf      27,Lleft1               ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        add     r13,r13,r4
        add     r14,r14,r6
        add     r15,r15,r7
        add     r16,r16,r8
Lleft1:
        bf      28,Lleft2
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        add     r13,r13,r4
        add     r14,r14,r6
Lleft2:
        bf      29,Lleft3
        lwz     r4,0(r3)
        addi    r3,r3,4
        add     r14,r14,r4
Lleft3:
        bf      30,Lleft4
        lhz     r4,0(r3)
        addi    r3,r3,2
        add     r15,r15,r4
Lleft4:
        bf      31,Lleft5
        lbz     r4,0(r3)
        slwi    r4,r4,8                 ; shift last byte into proper lane
        add     r16,r16,r4

; All data bytes have been checksummed. Now we must add together the four
; accumulators and restore the regs from the red zone.
; r3 = word aligned address of next byte of data
; r5 = accumulated sum parameter
; r13-r16 = the four accumulators
; carry = not used so far
; cr1_gt = "starting on odd address" flag

Lleft5:
        add     r8,r13,r14              ; add the four accumulators together
        add     r9,r15,r16
        lwz     r13,-4(r1)              ; start to restore nonvolatiles from red zone
        lwz     r14,-8(r1)
        add     r8,r8,r9                ; now r8 is 64-bit sum of the four accumulators
        lwz     r15,-12(r1)
        lwz     r16,-16(r1)
        srdi    r7,r8,32                ; get upper half of 64-bit sum
        addc    r2,r7,r8                ; finally, do a 32-bit add of the two halves of r8 (setting carry)
        b       Lwrapup                 ; merge r2, r5, and carry into a 16-bit checksum
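
; For reference, the final fold in illustrative C (names invented here):
;
;     uint64_t s = a0 + a1 + a2 + a3;                  // 64-bit total, cannot overflow
;     uint32_t r = (uint32_t)(s >> 32) + (uint32_t)s;  // may carry out of bit 31...
;
; ...which is why "addc" (rather than "add") is used above: it records that
; carry, and the "addze" at Lwrapup adds it back before folding to 16 bits.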