1 // $Id: WKdmDecompress.intel.s,v 1.1 2010/01/30 00:39:21 cclee Exp cclee $
3 // This file contains i386 and x86_64 (no SSE) optimized implementations of the WKdm decompressor.
4 // The implementation is derived by compiling (gcc -O3) the original C code (WKdmDecompress.c)
5 // followed by hand tweaking of the compiled assembly code.
12 .globl _WKdm_decompress
15 // save registers, set up base pointer %ebp, and allocate stack memory for local variables
24 // PRELOAD_DICTIONARY; dictionary starting address : -88(%ebp)
42 #define dictionary_addr -88(%ebp)
43 #define TAGS_AREA_END -7292(%ebp)
44 #define tempTagsArray -7300(%ebp)
45 #define tempQPosArray -2488(%ebp)
46 #define tempLowBitsArray -7288(%ebp)
47 #define next_low_bits -7296(%ebp)
48 #define dictionary -7308(%ebp)
49 #define tag_area_end -7304(%ebp)
51 // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray);
53 movl 8(%ebp), %eax // src_buf
54 addl $272, %eax // src_buf + 16 (WKdm Header) + 256 (Tags)
55 movl %eax, TAGS_AREA_END // TAGS_AREA_END(src_buf), kept in its stack slot
56 movl 8(%ebp), %eax // src_buf
57 movl %eax, %edi // %edi = src_buf
58 addl $16, %eax // TAGS_AREA_START(src_buf) = src_buf + 16 (WKdm Header)
59 leal -1288(%ebp), %edx // address of tempTagsArray[0]
60 movl %edx, tempTagsArray // save the address of tempTagsArray[] in its stack slot
61 cmpl %eax, TAGS_AREA_END // TAGS_AREA_END vs TAGS_AREA_START
62 jbe 1f // if TAGS_AREA_END<=TAGS_AREA_START, no need for WK_unpack_2bits
63 movl %edx, %ecx // %ecx -> tempTagsArray[0] (output pointer)
64 xorl %esi, %esi // i=0
65 movl $50529027, %ebx // 0x03030303, mask keeping the low 2 bits of every byte (4 tags per word)
68 movl 16(%edi,%esi,4), %edx // w = packed word i of the tag area (src_buf + 16-byte WKdm header + 4*i), 16 tags
69 movl %edx, %eax // w
70 andl %ebx, %eax // 1st 4 tags, one per byte
71 movl %eax, (%ecx) // save 1st 4 tags
72 movl %edx, %eax // w
73 shrl $2, %eax // w>>2
74 andl %ebx, %eax // 2nd 4 tags, one per byte
75 movl %eax, 4(%ecx) // save 2nd 4 tags
76 shrl $4, %edx // shift down w by 4 bits
77 movl %edx, %eax // w>>4
78 andl %ebx, %eax // 3rd 4 tags
79 movl %eax, 8(%ecx) // save 3rd 4 tags
81 andl %ebx, %edx // 4th 4 tags
82 movl %edx, 12(%ecx) // save 4th 4 tags
83 addl $16, %ecx // advance output pointer by 16 bytes (16 unpacked tags)
85 cmpl $64, %esi // i vs 64
86 jne L_WK_unpack_2bits // repeat the loop until i==64
89 // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray);
91 movl 8(%edi), %eax // WKdm header qpos end (offset in 32-bit words from src_buf)
92 leal (%edi,%eax,4), %esi // QPOS_AREA_END = src_buf + 4*offset
93 movl 4(%edi), %eax // WKdm header qpos start (offset in 32-bit words from src_buf)
94 leal (%edi,%eax,4), %ecx // QPOS_AREA_START = src_buf + 4*offset
95 cmpl %ecx, %esi // QPOS_AREA_END vs QPOS_AREA_START
96 jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip WK_unpack_4bits
97 leal tempQPosArray, %edi // %edi -> tempQPosArray (output pointer; replaces src_buf in %edi)
98 movl $252645135, %ebx // 0x0f0f0f0f : mask keeping the low 4 bits of every byte (4 qpos per word)
100 movl (%ecx), %eax // w = next packed qpos word
102 andl %ebx, %edx // 1st 4 qpos
103 movl %edx, (%edi) // save 1st 4 qpos
104 shrl $4, %eax // w>>4
105 andl %ebx, %eax // 2nd 4 qpos
106 movl %eax, 4(%edi) // save 2nd 4 qpos
107 addl $4, %ecx // point to next packed word w
108 addl $8, %edi // advance output pointer by 8 bytes (8 unpacked qpos)
109 cmpl %ecx, %esi // QPOS_AREA_END vs pointer to next packed word
110 ja L_WK_unpack_4bits // repeat while QPOS_AREA_END > pointer to next packed word
112 // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray);
115 movl 8(%ebp), %edx // src_buf
116 movl 12(%edx), %eax // LOW_BITS_AREA_END offset (in 32-bit words from src_buf)
117 leal (%edx,%eax,4), %edi // LOW_BITS_AREA_END = src_buf + 4*offset
118 cmpl %edi, %esi // LOW_BITS_AREA_START(=QPOS_AREA_END) vs LOW_BITS_AREA_END
119 jae 1f // if (LOW_BITS_AREA_START>=LOW_BITS_AREA_END) skip unpack_3_tenbits
120 leal tempLowBitsArray, %ecx // %ecx -> tempLowBitsArray (output pointer)
121 movl $1023, %ebx // 0x03ff to extract lower 10-bits
124 L_WK_unpack_3_tenbits:
125 movl (%esi), %eax // w = next packed low-bits word
127 andl %ebx, %edx // 1st 10-bit
128 movl %edx, (%ecx) // save 1st 10-bit
129 shrl $10, %eax // (w>>10)
130 movl %eax, %edx // (w>>10)
131 andl %ebx, %edx // 2nd 10-bit
132 movl %edx, 4(%ecx) // save 2nd 10-bit
133 shrl $10, %eax // (w>>20), no need to and with mask, the top 2 bits should be zero
134 movl %eax, 8(%ecx) // save 3rd 10-bits
135 addl $4, %esi // point to next w
136 addl $12, %ecx // tempLowBitsArray += 3;
137 cmpl %esi, %edi // LOW_BITS_AREA_END vs pointer to next packed word
138 ja L_WK_unpack_3_tenbits // repeat while LOW_BITS_AREA_END > pointer to next packed word
142 popl %ebx // set up %ebx for use in hash table lookup
144 #define next_tag %esi
145 #define next_qpos %edi
147 movl tempTagsArray, next_tag // next_tag = tempTagsArray
148 leal tempQPosArray, next_qpos // next_qpos = tempQPosArray
149 movl 12(%ebp), %ecx // dest_buf
150 addl $4, %ecx // point one word past the next output slot (stores use -4(%ecx)); for some reason, performance is better this way
151 leal tempLowBitsArray, %eax // tempLowBitsArray
152 movl %eax, next_low_bits // next_low_bits = tempLowBitsArray
153 leal -264(%ebp), %edx // tempTagsArray + 1024 (tempTagsArray starts at -1288(%ebp))
154 movl %edx, tag_area_end // tag_area_end
155 leal dictionary_addr, %eax // dictionary starting address
156 movl %eax, dictionary // dictionary
161 cmpb $2, %al // MISS_TAG
164 movsbl (next_qpos),%eax // qpos = *next_qpos
165 incl next_qpos // next_qpos++
166 movl dictionary, %edx // dictionary
167 movl (%edx,%eax,4), %eax // w = dictionary[qpos]
168 movl %eax, -4(%ecx) // *dest_buf = w
171 incl next_tag // next_tag++
172 addl $4, %ecx // dest_buf++
173 cmpl tag_area_end, next_tag // next_tag vs tag_area_end
174 jae L_done // if (next_tag>=tag_area_end), we're done
176 movzbl (next_tag), %eax // tag = *next_tag
177 cmpb $1, %al // Partial match?
180 movsbl (next_qpos),%edx // qpos = *next_qpos
181 movl dictionary, %eax // dictionary
182 leal (%eax,%edx,4), %edx // dict_location = &dictionary[qpos]
183 movl %edx, -7324(%ebp) // save dict_location to release %edx
184 incl next_qpos // next_qpos++
185 movl (%edx), %eax // read dictionary word
186 andl $-1024, %eax // keep only higher 22-bits
187 movl next_low_bits, %edx // %edx = next_low_bits (pointer into the unpacked low bits)
188 orl (%edx), %eax // OR in *next_low_bits to construct the new partially matched word
190 movl %edx, next_low_bits // next_low_bits++
191 movl -7324(%ebp), %edx // dict_location
192 movl %eax, (%edx) // update *dict_location with the newly constructed word
193 movl %eax, -4(%ecx) // *dest_buf = the newly constructed word
194 incl next_tag // next_tag++
195 addl $4, %ecx // dest_buf++
196 cmpl tag_area_end, next_tag // next_tag vs tag_area_end
197 jb L11 // if next_tag < tag_area_end, repeat the loop
200 // release stack memory, restore registers, and return
208 #define next_full_patt -7292(%ebp) /* next_full_patt starts with initial value of TAGS_AREA_END */
212 movl next_full_patt, %edx // next_full_patt
213 movl (%edx), %eax // word = *next_full_patt
214 addl $4, %edx // next_full_patt++
215 movl %edx, next_full_patt // save next_full_patt
216 movl %eax, %edx // word
217 shrl $10, %edx // word>>10
218 andl $255, %edx // 8-bit hash table index
219 movsbl _hashLookupTable-Lhash(%ebx,%edx),%edx // qpos (hash table entry; used unscaled as a byte offset below)
220 movl %eax, -88(%ebp,%edx) // dictionary[qpos] = word (dictionary starts at -88(%ebp))
221 movl %eax, -4(%ecx) // *dest_buf = word
222 jmp L_next // repeat the loop
226 movl $0, -4(%ecx) // *dest_buf = 0
227 jmp L_next // repeat the loop
231 #if defined __x86_64__
237 .globl _WKdm_decompress
240 // save registers, and allocate stack memory for local variables
248 movq %rsi, %r12 // dest_buf
250 // PRELOAD_DICTIONARY; dictionary starting address : -80(%rbp)
268 // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray);
269 leaq 272(%rdi), %r10 // TAGS_AREA_END
270 leaq 16(%rdi), %rax // TAGS_AREA_START
271 leaq -1280(%rbp), %rsi // tempTagsArray
272 cmpq %rax, %r10 // TAGS_AREA_END vs TAGS_AREA_START
273 jbe 1f // if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits
274 movq %rsi, %rcx // next_tags = tempTagsArray (output pointer)
275 xorl %r8d, %r8d // i = 0
278 movl 16(%rdi,%r8,4), %edx // w = packed tag word i (TAGS_AREA_START + 4*i)
280 andl $50529027, %eax // 1st 4 tags (mask 0x03030303)
281 movl %eax, (%rcx) // write 1st 4 tags
283 shrl $2, %eax // w>>2
284 andl $50529027, %eax // 2nd 4 tags
285 movl %eax, 4(%rcx) // write 2nd 4 tags
286 shrl $4, %edx // w>>4
287 movl %edx, %eax // w>>4
288 andl $50529027, %eax // 3rd 4 tags
289 movl %eax, 8(%rcx) // write 3rd 4 tags
290 shrl $2, %edx // w>>6
291 andl $50529027, %edx // 4th 4 tags
292 movl %edx, 12(%rcx) // write 4th 4 tags
293 addq $16, %rcx // next_tags += 16
295 cmpq $64, %r8 // i vs 64
296 jne L_WK_unpack_2bits // repeat loop until i==64
299 // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray);
301 mov 8(%rdi), %eax // WKdm header qpos end (offset in 32-bit words)
302 leaq (%rdi,%rax,4), %r9 // QPOS_AREA_END
303 mov 4(%rdi), %eax // WKdm header qpos start (offset in 32-bit words)
304 leaq (%rdi,%rax,4), %r8 // QPOS_AREA_START
305 leaq -2480(%rbp), %rbx // tempQPosArray
306 cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START
307 jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits
308 leaq 8(%rbx), %rcx // next_qpos, biased by +8 (stores below use -8/-4 offsets)
310 movl (%r8), %eax // w = *next_word
312 andl $252645135, %edx // 1st 4 qpos (mask 0x0f0f0f0f)
313 movl %edx, -8(%rcx) // write 1st 4 qpos
314 shrl $4, %eax // w>>4
315 andl $252645135, %eax // 2nd 4 qpos
316 movl %eax, -4(%rcx) // write 2nd 4 qpos
317 addq $4, %r8 // next_word++
318 addq $8, %rcx // next_qpos+=8
319 cmpq %r8, %r9 // QPOS_AREA_END vs next_word
320 ja L_WK_unpack_4bits // repeat loop while QPOS_AREA_END > next_word
323 // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray);
325 mov 12(%rdi), %eax // LOW_BITS_AREA_END offset (in 32-bit words)
326 leaq (%rdi,%rax,4), %rdi // LOW_BITS_AREA_END (overwrites src_buf in %rdi)
327 leaq -7280(%rbp), %r11 // tempLowBitsArray
328 cmpq %rdi, %r9 // LOW_BITS_AREA_START (= QPOS_AREA_END, still in %r9) vs LOW_BITS_AREA_END
329 jae 1f // if START>=END, skip L_WK_unpack_3_tenbits
330 leaq 12(%r11), %rcx // next_low_bits, biased by +12 (stores below use -12/-8/-4 offsets)
331 L_WK_unpack_3_tenbits:
332 movl (%r9), %eax // w = *next_word
334 andl $1023, %edx // 1st tenbits (mask 0x3ff)
335 movl %edx, -12(%rcx) // write 1st tenbits
336 shrl $10, %eax // w >> 10
337 movl %eax, %edx // w >> 10
338 andl $1023, %edx // 2nd tenbits
339 movl %edx, -8(%rcx) // write 2nd tenbits
340 shrl $10, %eax // w >> 20, 3rd tenbits (not masked)
341 movl %eax, -4(%rcx) // write 3rd tenbits
342 addq $4, %r9 // next_word++
343 addq $12, %rcx // next_low_bits += 3
344 cmpq %r9, %rdi // LOW_BITS_AREA_END vs next_word
345 ja L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word
347 movq %rsi, %rdi // next_tag = tempTagsArray
348 movq %rbx, %r8 // next_qpos = tempQPosArray
349 leaq 4(%r12), %rcx // dest_buf + 4 (stores below use -4(%rcx))
350 movq %r11, %r9 // next_low_bits = tempLowBitsArray
351 leaq -80(%rbp), %r11 // dictionary
352 leaq _hashLookupTable(%rip), %rbx // hash look up table
353 leaq 1024(%rsi), %rsi // tag_area_end = tempTagsArray + 1024
359 cmpb $2, %al // MISS_TAG
362 movsbq (%r8),%rax // qpos = *next_qpos
363 incq %r8 // next_qpos++
364 movl (%r11,%rax,4), %eax // w = dictionary[qpos]
365 movl %eax, -4(%rcx) // *dest_buf = w
368 incq %rdi // next_tag++
369 addq $4, %rcx // dest_buf++
370 cmpq %rsi, %rdi // next_tag vs tag_area_end
371 jae L_done // if next_tag >= tag_area_end, we're done
373 movzbl (%rdi), %eax // tag = *next_tag
374 cmpb $1, %al // partial match tag ?
377 movsbq (%r8),%rdx // qpos = *next_qpos
378 leaq (%r11,%rdx,4), %rdx // dict_location = &dictionary[qpos]
379 incq %r8 // next_qpos++
380 movl (%rdx), %eax // read dictionary word
381 andl $-1024, %eax // clear lower 10 bits (keep the upper 22)
382 orl (%r9), %eax // fill the lower 10 bits from *next_low_bits
383 addq $4, %r9 // next_low_bits++
384 movl %eax, (%rdx) // *dict_location = newly formed word
385 movl %eax, -4(%rcx) // *dest_buf = newly formed word
386 cmpq %rsi, %rdi // compare next_tag vs tag_area_end
387 jne L_next // jump to L_next unless next_tag==tag_area_end
390 // release stack memory, restore registers, and return
399 movl (%r10), %eax // w = *next_full_patt
400 addq $4, %r10 // next_full_patt++
402 shrl $10, %edx // w>>10
403 movzbl %dl, %edx // 8-bit hash table index
404 movsbq (%rbx,%rdx),%rdx // qpos (hash table entry; used unscaled as a byte offset below)
405 movl %eax, -80(%rbp,%rdx) // dictionary[qpos] = word (dictionary starts at -80(%rbp))
406 movl %eax, -4(%rcx) // *dest_buf = word
407 jmp L_next // repeat the loop
411 movl $0, -4(%rcx) // *dest_buf = 0
412 jmp L_next // repeat the loop
416 .globl _hashLookupTable