]> git.saurik.com Git - apple/xnu.git/blob - bsd/dev/ppc/xsumas.s
xnu-344.23.tar.gz
[apple/xnu.git] / bsd / dev / ppc / xsumas.s
1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 #define STANDALONE 0
23
24 #if STANDALONE
25 #include "asm.h"
26 #include "assym.h"
27 #include "proc_reg.h" /* For CACHE_LINE_SIZE */
28
29 #else
30
31 #include <mach/ppc/asm.h>
32 #if 0
33 /* #include <assym.h> */
34 #include <ppc/proc_reg.h> /* For CACHE_LINE_SIZE */
35 #endif 0
36 #endif
37
/*
 * xsum_assym - accumulate a 16-bit one's-complement checksum over a buffer.
 *
 * Reg 3  - Pointer to data
 * Reg 4  - Length of data, in bytes
 * Reg 5  - Accumulated sum value (added into the result)
 * Reg 6  - Starting on odd boundary flag (relative to byte 0 of the
 *          checksummed data); any nonzero value counts as set
 *
 * Returns the new accumulated sum, folded to 16 bits, in Reg 3.
 *
 * Working registers:
 *   r8      - running 32-bit sum; carries are chained through XER[CA]
 *   r5-r7   - loaded data and scratch
 *   r9, r10 - masks, strides and dcbt offsets
 *   r11     - nonzero when the two bytes of the folded sum must be swapped
 *   r12     - copy of the passed-in sum
 *
 * XER[CA] is live from the addic below until Ldone. Everything in between
 * must leave it alone, which is why the pointer and length updates use
 * add/subf/cmpi and never their carrying forms.
 */

ENTRY(xsum_assym, TAG_NO_FRAME_USED)

	mr	r11, r6			; Swapped flag
	addi	r8, 0, 0		; Running sum starts at zero
	addi	r10, 0, 0x1f		; Mask for the offset within a 32 byte line
	addi	r7, 0, 1
	addic	r7, r7, 0		; This clears the carry bit!
	mr	r12, r5			; Save the passed-in checksum value

/*
 * Sum bytes before cache line boundary
 */

	cmpi	cr0,0,r4,0		; Check for length of 0
	beq	Lleftovers

	and.	r9, r3, r10
	beq	Laligned32		; 32 byte aligned

	andi.	r9, r3, 0x3
	beq	Laligned4

	andi.	r9, r3, 0x1
	beq	Laligned2		; 2 byte aligned

/*
 * The buffer starts on an odd address. The first byte is summed as the
 * low half of a 16-bit word, which leaves the sum one byte out of phase
 * with the buffer. If the buffer itself starts on an odd byte of the
 * checksummed data the two shifts cancel, so the swap flag is toggled
 * rather than forced on. cntlzw/srwi compute a logical NOT without
 * touching the carry bit.
 */
	cntlzw	r11, r11		; 32 if the flag was clear, less than 32 otherwise
	srwi	r11, r11, 5		; r11 = 1 if the flag was clear, else 0
	lbz	r8, 0(r3)
	add	r3, r3, r7
	subf.	r4, r7, r4
	beq	Ldone

Laligned2:
	cmpi	cr0,0,r4,2		; If remaining length is less than two - go to wrap-up
	blt	Lleftovers
	andi.	r9, r3, 0x3		; If aligned on a 4-byte boundary, go to that code
	beq	Laligned4
	lhz	r5, 0(r3)		; Load and add a halfword to the checksum
	adde	r8, r8, r5
	slwi	r7, r7, 1		; r7 was 1 on every path here, so the stride becomes 2
	add	r3, r3, r7
	subf.	r4, r7, r4
	beq	Ldone


/*
   Add longwords up to the 32 byte boundary
*/

Laligned4:
	addi	r7, 0, 4
Lloop4:
	cmpi	cr0,0,r4,4
	blt	Lleftovers
	and.	r9, r3, r10
	beq	Laligned32
	lwz	r5, 0(r3)
	adde	r8, r8, r5
	add	r3, r3, r7
	subf.	r4, r7, r4
	bne	Lloop4
	b	Ldone


/*
   We're aligned on a 32 byte boundary now - add 8 longwords to checksum
   until the remaining length is less than 32
*/
Laligned32:
	cmpli	cr0,0,r4,32		; Unsigned, so every bit of the length counts
	blt	Lleftovers

Lmainloop:
	addi	r9, 0, 64
	addi	r10, 0, 32
	cmpi	cr0,0,r4,64
	blt	Lnopretouch
	dcbt	r3, r10			; Touch one cache-line ahead
Lnopretouch:
	lwz	r5, 0(r3)

/*
 * This is the main meat of the checksum. I attempted to arrange this code
 * such that the processor would execute as many instructions as possible
 * in parallel.
 */

Lloop:
	cmpi	cr0,0,r4,96
	blt	Lnotouch
	dcbt	r3, r9			; Touch two cache lines ahead
Lnotouch:
	adde	r8, r8, r5
	lwz	r5, 4(r3)
	lwz	r6, 8(r3)
	lwz	r7, 12(r3)
	adde	r8, r8, r5
	lwz	r5, 16(r3)
	adde	r8, r8, r6
	lwz	r6, 20(r3)
	adde	r8, r8, r7
	lwz	r7, 24(r3)
	adde	r8, r8, r5
	lwz	r5, 28(r3)
	add	r3, r3, r10
	adde	r8, r8, r6
	adde	r8, r8, r7
	adde	r8, r8, r5
	subf	r4, r10, r4
	cmpli	cr0,0,r4,32		; Test the whole length, not just its low 16 bits,
	blt	Lleftovers		; so buffers of 64K and up are summed completely
	lwz	r5, 0(r3)
	b	Lloop

/*
 * Handle whatever bytes are left
 */

Lleftovers:
	cmpi	cr0,0,r4,0
	beq	Ldone

	addi	r7, 0, 1
	addi	r10, 0, 0x7ffc		; Any of these bits set: at least one longword left

	and.	r9, r4, r10
	bne	Lfourormore
	srw	r10, r10, r7		; Mask becomes 0x3ffe: at least one halfword left
	and.	r9, r4, r10
	bne	Ltwoormore
	b	Loneleft

Lfourormore:
	addi	r10, 0, 4

Lfourloop:
	lwz	r5, 0(r3)
	adde	r8, r8, r5
	add	r3, r3, r10
	subf	r4, r10, r4
	andi.	r6, r4, 0xfffc
	bne	Lfourloop

Ltwoormore:
	andi.	r6, r4, 0xfffe
	beq	Loneleft
	lhz	r5, 0(r3)
	adde	r8, r8, r5
	addi	r3, r3, 2
	subi	r4, r4, 2

Loneleft:
	cmpi	cr0,0,r4,0
	beq	Ldone
	lbz	r5, 0(r3)
	slwi	r5, r5, 8		; A trailing byte is the high half of its 16-bit word
	adde	r8, r8, r5

/*
 * Wrap the longword around, adding the two 16-bit portions
 * to each other along with any previous and subsequent carries.
 */
Ldone:
	addze	r8, r8			; Add the carry
	addze	r8, r8			; Add the carry again (the last add may have carried)
	andis.	r6, r8, 0xffff		; Stuff r6 with the high order 16 bits of sum word
	srwi	r6, r6, 16		; Shift it to the low order word
	andi.	r8, r8, 0xffff		; Zero out the high order word
	add	r8, r8, r6		; Add the two halves

	andis.	r6, r8, 0xffff		; Do the above again in case we carried into the
	srwi	r6, r6, 16		; high order word with the last add.
	andi.	r8, r8, 0xffff
	add	r3, r8, r6

	cmpi	cr0,0,r11,0		; Check to see if we need to swap the bytes
	beq	Ldontswap

/*
 * The sum was accumulated one byte out of phase with the checksummed
 * data, so we need to swap the checksum bytes.
 */
	slwi	r8, r3, 8		; shift byte 0 to byte 1
	clrlwi	r8, r8, 16		; Clear top 16 bits
	srwi	r3, r3, 8		; shift byte 1 to byte 0
	or	r3, r8, r3		; or them

Ldontswap:
	addc	r3, r3, r12		; Add in the passed-in checksum
	addze	r3, r3			; End-around carry in case that add overflowed 32 bits
	andis.	r6, r3, 0xffff		; Wrap and add any carries into the top 16 bits
	srwi	r6, r6, 16
	andi.	r3, r3, 0xffff
	add	r3, r3, r6

	andis.	r6, r3, 0xffff		; Do the above again in case we carried into the
	srwi	r6, r6, 16		; high order word with the last add.
	andi.	r3, r3, 0xffff
	add	r3, r3, r6
	blr
248
249