1 // $Id: WKdmCompress.intel.s,v 1.1 2010/01/28 22:33:24 cclee Exp cclee $
3 // This file contains i386 and x86_64 (no SSE) optimized implementation of WKdm Compressor. The function prototype is
5 // unsigned int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, unsigned int num_input_words);
7 // The implementation assumes the input buffer is a memory page (4096 bytes or 1024 words), or something less than 4KB.
9 // WKdm Compression algorithm is briefly stated as follows:
11 // There is a dynamically updated dictionary of 16 words, each initialized with "1".
13 // the dictionary is indexed as follows,
15 // 1, hash_index = (x>>10)&255
16 // 2, dict_location = &dictionary[hash_index]
17 // 3, dict_word = *dict_location
19 // Sequentially for each input word, it is classified/tagged into 4 classes
20 // 0 : if the input word is 0
21 // 1 : the higher 22 bits of the input word is identically to the higher bits from the dictionary (hash table indexed)
22 // 2 : the above condition (partially 22 higher bits matched) is not met, a dictionary miss condition
23 // 3 : the input word is exactly matched to the word from the dictionary (hash table index)
25 // after each input word is classified, each tag is represented by 2 bits. Furthermore, for each class
26 // 0 : no further info is needed
27 // 1 : the hash_index is represented by 4-bits (8 packed into a word),
28 // the lower 10-bits is sent to the decompressor (3 packed into a word)
29 // 2 : the 32-bit word is sent to the decompressor
30 // 3 : the hash_index is represented by 4-bits (8 packed into a word)
32 // for classes 1 and 2, the input word is used to update the dictionary after it is classified/tagged
34 // the following implementation was started from compiling (gcc -O3) the original C code (WKdmCompress.c)
35 // and then subsequentially improved and documented.
36 // For i386, it speeds up ~ 1.5 times
37 // For x86_64, it speeds up ~ 1.3 times
41 #if !(defined __i386__ || defined __x86_64__)
43 typedef char DummyDefinition;
45 #else // i386 or x86_64 architectures
47 #if defined __i386__ // 32-bit implementation
62 // allocate stack memory for local variables
66 leal _hashLookupTable, %ebx // hashTable
68 movl 8(%ebp), %edx // %edx = src_buf
69 movl 12(%ebp), %esi // %esi = dest_buf
70 movl 16(%ebp), %eax // %eax = num_input_words
72 leal -1112(%ebp), %ecx // tempTagsArray
73 movl %ecx, -6272(%ebp) // a copy of char* next_tag = (char *) tempTagsArray;
75 leal -2136(%ebp), %ecx // tempQPosArray
76 movl %ecx, -6264(%ebp) // char* next_qp = (char *) tempQPosArray;
77 movl %ecx, -6252(%ebp)
79 leal (%edx,%eax,4), %ecx // src_buf + num_input_words*4
80 movl %ecx, -6244(%ebp) // end_of_input = src_buf + num_input_words;
82 // PRELOAD_DICTIONARY;
100 shrl $4, %eax // (num_input_words / 16)
101 leal 16(%esi,%eax,4), %eax // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4
102 movl %eax, -6256(%ebp) // next_full_patt = dest_buf + TAGS_AREA_OFFSET + (num_input_words / 16);
104 leal -6232(%ebp), %eax // &tempLowBitsArray[0]
105 movl %eax, -6260(%ebp) // save a copy of &tempLowBitsArray[0]
106 movl %eax, -6248(%ebp) // save a copy of &tempLowBitsArray[0]
108 cmpl %ecx, %edx // next_input_word (%edx) vs end_of_input (%ecx)
109 jae L_done_search // if (next_input_word >= end_of_input) skip the following search loop
111 leal -1111(%ebp), %esi // &next_tag[1]
112 leal -88(%ebp), %ebp // dictionary
114 movl %edx, %edi // next_input_word
116 #define next_input_word %edi
117 #define dictionary %ebp
118 #define next_tag %esi
124 movb $0, -1(next_tag) // *next_tag = ZERO;
126 addl $4, next_input_word // next_input_word++;
127 incl next_tag // next_tag++
128 cmpl next_input_word, 84(%esp) // end_of_input vs next_input_word
129 jbe L_done_search // if (next_input_word>=end_of_input), skip to L_done_search
131 movl (next_input_word), %ecx // input_word = *next_input_word;
132 movl %ecx, %eax // a copy of input_word
133 testl %ecx, %ecx // input_word
134 je L_RECORD_ZERO // if (input_word==0) RECORD_ZERO
135 shrl $10, %eax // input_high_bits = HIGH_BITS(input_word);
136 movl %eax, (%esp) // save a copy of input_high_bits;
137 andl $255, %eax // 8 bits index to Hash Table
138 movsbl (%ebx,%eax),%edx // HASH_TO_DICT_BYTE_OFFSET(input_word)
139 addl dictionary, %edx // ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word));
140 movl (%edx), %eax // dict_word = *dict_location;
141 cmpl %eax, %ecx // cmp input_word vs dict_word
143 shrl $10, %eax // HIGH_BITS(dict_word)
144 cmpl %eax, (%esp) // input_high_bits vs HIGH_BITS(dict_word)
145 je L_RECORD_PARTIAL // if (input_high_bits == HIGH_BITS(dict_word)) RECORD_PARTIAL
148 movb $2, -1(next_tag) // *next_tag = 2 for miss
149 movl 72(%esp), %eax // next_full_patt
150 movl %ecx, (%eax) // *next_full_patt = input_word;
151 addl $4, %eax // next_full_patt++;
152 movl %eax, 72(%esp) // save next_full_patt
153 movl %ecx, (%edx) // *dict_location = input_word
158 movb $3, -1(next_tag) // *next_tag = 3 for exact
159 subl dictionary, %edx // dict_location - dictionary
160 sarl $2, %edx // divide by 4 for word offset
161 movl 76(%esp), %eax // next_qp
162 movb %dl, (%eax) // *next_qp = word offset (4-bit)
163 incl %eax // next_qp++
164 movl %eax, 76(%esp) // save next_qp
169 // restore %ebp as normal use (was used as dictionary)
173 // SET_QPOS_AREA_START(dest_buf,next_full_patt);
174 movl -6256(%ebp), %edi // next_full_patt
175 subl 12(%ebp), %edi // next_full_patt - dest_buf
176 movl %edi, %eax // next_full_patt - dest_buf
177 sarl $2, %eax // in 4-byte words
178 movl %eax, -6240(%ebp) // save (next_full_patt - dest_buf) in words
179 movl 12(%ebp), %edx // dest_buf
180 movl %eax, 4(%edx) // dest_buf[1] = next_full_patt - dest_buf
182 movl -6272(%ebp), %ecx // &tempTagsArray[0]
184 cmpl next_tag, %ecx // next_tag vs &tempTagsArray[0]
185 jae L13 // if &tempTagsArray[0] >= next_tag, skip the following WK_pack_2bits
187 movl %edx, %ebx // a copy of dest_buf
189 // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
193 movl 4(%ecx), %eax // w1
194 sall $2, %eax // w1 << 2
195 movl 8(%ecx), %edx // w2
196 sall $4, %edx // w2 << 4
197 orl %edx, %eax // (w1<<2) | (w2<<4)
198 orl (%ecx), %eax // (w0) | (w1<<2) | (w2<<4)
199 movl 12(%ecx), %edx // w3
200 sall $6, %edx // (w3<<6)
201 orl %edx, %eax // (w0) | (w1<<2) | (w2<<4) | (w3<<6)
202 movl %eax, 16(%ebx) // save at *(dest_buf + HEADER_SIZE_IN_WORDS)
203 addl $16, %ecx // tempTagsArray += 16;
204 addl $4, %ebx // dest_buf += 4;
205 cmpl %ecx, next_tag // cmp next_tag vs dest_buf
206 ja L_WK_pack_2bits // if (next_tag > dest_buf) repeat L_WK_pack_2bits
208 /* Pack the queue positions into the area just after the full words. */
210 movl -6252(%ebp), %eax // next_qp
211 movl -6264(%ebp), %ecx // (char *) tempQPosArray
212 movl %eax, %esi // next_qp
213 subl %ecx, %eax // num_bytes_to_pack = next_qp - (char *) tempQPosArray;
214 addl $7, %eax // num_bytes_to_pack + 7
215 andl $-8, %eax // clear lower 3 bits, (num_packed_words<<3)
216 addl %eax, %ecx // endQPosArray = tempQPosArray + num_source_words;
217 cmpl %ecx, %esi // next_qp vs endQPosArray
221 movb $0, (%esi) // *next_qp = 0;
222 incl %esi // next_qp++
223 cmpl %ecx, %esi // next_qp vs endQPosArray
227 movl -6256(%ebp), %ebx // next_full_patt
228 cmpl -6264(%ebp), %ecx // endQPosArray vs tempQPosArray
229 jbe L20 // if (endQPosArray<=tempQPosArray) skip L_WK_pack_4bits
230 movl -6264(%ebp), %edx // tempQPosArray
233 // boundary_tmp = WK_pack_4bits(tempQPosArray, endQPosArray, next_full_patt);
237 movl 4(%edx), %eax // src_next[1]
238 sall $4, %eax // (src_next[1] << 4)
239 orl (%edx), %eax // temp = src_next[0] | (src_next[1] << 4)
240 movl %eax, (%ebx) // dest_next[0] = temp;
241 addl $4, %ebx // dest_next++;
242 addl $8, %edx // src_next += 2;
243 cmpl %edx, %ecx // source_end vs src_next
244 ja L21 // while (src_next < source_end) repeat the loop
246 movl %ebx, %edi // boundary_tmp
248 subl 12(%ebp), %edi // boundary_tmp - dest_buf
249 movl %edi, %eax // boundary_tmp - dest_buf
250 sarl $2, %eax // translate into word offset
252 movl %eax, -6240(%ebp) // save (next_full_patt - dest_buf) in words
255 // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
256 movl -6240(%ebp), %ecx // boundary_tmp - dest_buf
257 movl 12(%ebp), %edx // dest_buf
258 movl %ecx, 8(%edx) // dest_buf[2] = boundary_tmp - dest_buf
260 movl -6260(%ebp), %ecx // tempLowBitsArray
261 movl -6248(%ebp), %edx // next_low_bits
262 subl %ecx, %edx // next_low_bits - tempLowBitsArray
263 sarl $2, %edx // num_tenbits_to_pack
265 subl $3, %edx // pre-decrement num_tenbits_to_pack by 3
266 jl 1f // if num_tenbits_to_pack < 3, skip the following loop
269 movl 4(%ecx), %eax // w1
270 sall $10, %eax // w1<<10
271 movl 8(%ecx), %esi // w2
272 sall $20, %esi // w2<<20
273 orl %esi, %eax // (w1<<10) | (w2<<20)
274 orl (%ecx), %eax // (w0) | (w1<<10) | (w2<<20)
275 movl %eax, (%ebx) // pack w0,w1,w2 into 1 dest_buf word
276 addl $4, %ebx // dest_buf++
277 addl $12, %ecx // next w0/w1/w2 triplet
278 subl $3, %edx // num_tenbits_to_pack-=3
279 jge 0b // if no less than 3 elements, back to loop head
281 1: addl $3, %edx // post-increment num_tenbits_to_pack by 3
282 je 3f // if num_tenbits_to_pack is a multiple of 3, skip the following
283 movl (%ecx), %eax // w0
284 subl $1, %edx // num_tenbits_to_pack --
286 movl 4(%ecx), %esi // w1
287 sall $10, %esi // w1<<10
290 movl %eax, (%ebx) // write the final dest_buf word
291 addl $4, %ebx // dest_buf++
293 movl %ebx, %eax // boundary_tmp
294 subl 12(%ebp), %eax // boundary_tmp - dest_buf
295 sarl $2, %eax // boundary_tmp - dest_buf in terms of words
296 movl 12(%ebp), %esi // dest_buf
297 movl %eax, 12(%esi) // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp);
298 sall $2, %eax // boundary_tmp - dest_buf in terms of bytes
299 addl $6316, %esp // pop out stack memory
309 movb $1, -1(next_tag) // *next_tag = 1 for partial matched
310 movl %edx, %eax // dict_location
311 subl dictionary, %eax // %eax = dict_location - dictionary
312 movl %ecx, (%edx) // *dict_location = input_word;
313 sarl $2, %eax // offset in 32-bit word
314 movl 76(%esp), %edx // next_qp
315 movb %al, (%edx) // update *next_qp
316 incl %edx // next_qp++
317 movl %edx, 76(%esp) // save next_qp
318 movl %ecx, %eax // a copy of input_word
319 andl $1023, %eax // lower 10 bits
320 movl 80(%esp), %edx // next_low_bits
321 movl %eax, (%edx) // EMIT_WORD(next_low_bits,(low_bits_pattern))
322 addl $4, %edx // next_low_bits++
323 movl %edx, 80(%esp) // save next_low_bits
326 #endif // i386 architectures
328 #if defined __x86_64__ // 64-bit implementation
332 .globl _WKdm_compress
343 #define tempTagsArray -6264(%rbp)
344 #define tempLowBitsArray -6272(%rbp)
346 #define next_input_word %rdi
347 #define end_of_input %r13
348 #define next_full_patt %rbx
349 #define dict_location %rcx
351 #define dictionary %r11
352 #define dest_buf %r12
353 #define hashTable %r14
354 #define tempQPosArray %r15
355 #define next_low_bits %rsi
357 movq %rsi, %r12 // dest_buf
359 leaq -1136(%rbp), %rax // &tempTagsArray[0]
360 movq %rax, tempTagsArray
361 leaq 1(%rax), next_tag // next_tag always points to the one following the current tag
363 leaq -2160(%rbp), %r15 // &tempQPosArray[0]
364 movq %r15, next_qp // next_qp
366 mov %edx, %eax // num_input_words
367 leaq (%rdi,%rax,4), end_of_input // end_of_input = src_buf + num_input_words
369 // PRELOAD_DICTIONARY;
387 shrl $4, %edx // (num_input_words / 16)
388 mov %edx, %edx // sign extension into quad word
389 leaq 16(%rsi,%rdx,4), %rbx // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4
391 leaq -6256(%rbp), %rax // &tempLowBitsArray[0]
392 movq %rax, tempLowBitsArray // save for later reference
393 movq %rax, next_low_bits // next_low_bits
395 cmpq end_of_input, next_input_word // next_input_word vs end_of_input
396 jae L_done_search // if (next_input_word>=end_of_input) no work to do in search
397 leaq -112(%rbp), dictionary // dictionary
398 leaq _hashLookupTable(%rip), hashTable // hash look up table
403 movb $0, -1(next_tag) // *next_tag = ZERO;
405 addq $4, next_input_word // next_input_word++;
406 incq next_tag // next_tag++
407 cmpq next_input_word, end_of_input // end_of_input vs next_input_word
410 movl (next_input_word), %edx // input_word = *next_input_word;
411 movl %edx, %r9d // a copy of input_word
412 testl %edx, %edx // input_word
413 je L_RECORD_ZERO // if (input_word==0) RECORD_ZERO
414 shrl $10, %r9d // input_high_bits = HIGH_BITS(input_word);
415 movzbl %r9b, %eax // 8-bit index to the Hash Table
416 movsbq (hashTable,%rax),%rax // HASH_TO_DICT_BYTE_OFFSET(input_word)
417 leaq (dictionary, %rax), dict_location // ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word));
418 movl (dict_location), %eax // dict_word = *dict_location;
419 cmpl %eax, %edx // dict_word vs input_word
420 je L_RECORD_EXACT // if identical, RECORD_EXACT
421 shrl $10, %eax // HIGH_BITS(dict_word)
422 cmpl %eax, %r9d // input_high_bits vs HIGH_BITS(dict_word)
423 je L_RECORD_PARTIAL // if identical, RECORD_PARTIAL
426 movb $2, -1(next_tag) // *next_tag = 2 for miss
427 movl %edx, (next_full_patt) // *next_full_patt = input_word;
428 addq $4, next_full_patt // next_full_patt++
429 movl %edx, (dict_location) // *dict_location = input_word
430 addq $4, next_input_word // next_input_word++
431 incq next_tag // next_tag++
432 cmpq next_input_word, end_of_input // end_of_input vs next_input_word
433 ja L5 // if (end_of_input>next_input_word) repeat from L5
437 // SET_QPOS_AREA_START(dest_buf,next_full_patt);
438 //movq next_full_patt, %r11 // next_full_patt
439 movq next_full_patt, %rax // next_full_patt
440 subq dest_buf, %rax // next_full_patt - dest_buf
441 sarq $2, %rax // offset in 4-bytes
442 movl %eax, %r13d // r13d = (next_full_patt - dest_buf)
443 movl %eax, 4(dest_buf) // dest_buf[1] = next_full_patt - dest_buf
446 cmpq next_tag, tempTagsArray // &tempTagsArray[0] vs next_tag
447 jae L13 // if (&tempTagsArray[0] >= next_tag), skip the following
449 // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
451 movq dest_buf, %rdi // dest_buf
452 movq tempTagsArray, %rcx // &tempTagsArray[0]
456 movl 4(%rcx), %eax // w1
457 sall $2, %eax // w1 << 2
458 movl 8(%rcx), %edx // w2
459 sall $4, %edx // w2 << 4
460 orl %edx, %eax // (w1<<2) | (w2<<4)
461 orl (%rcx), %eax // (w0) | (w1<<2) | (w2<<4)
462 movl 12(%rcx), %edx // w3
463 sall $6, %edx // w3 << 6
464 orl %edx, %eax // (w0) | (w1<<2) | (w2<<4) | (w3<<6)
465 movl %eax, 16(%rdi) // save at *(dest_buf + HEADER_SIZE_IN_WORDS)
466 addq $16, %rcx // tempTagsArray += 16;
467 addq $4, %rdi // dest_buf += 4;
468 cmpq %rcx, next_tag // cmp next_tag vs dest_buf
469 ja L_pack_2bits // if (next_tag > dest_buf) repeat L_pack_2bits
471 /* Pack the queue positions into the area just after the full words. */
474 movl %r10d, %eax // next_qp
475 subl %r15d, %eax // num_bytes_to_pack = next_qp - (char *) tempQPosArray;
476 addl $7, %eax // num_bytes_to_pack+7
477 shrl $3, %eax // num_packed_words = (num_bytes_to_pack + 7) >> 3
478 addl %eax, %eax // num_source_words = num_packed_words * 2;
480 leaq (tempQPosArray,%rax,4), %rcx // endQPosArray = tempQPosArray + num_source_words
481 cmpq %rcx, %r10 // next_qp vs endQPosArray
482 jae L16 // if (next_qp >= endQPosArray) skip the following zero paddings
485 movb $0, (next_qp) // *next_qp = 0
486 incq next_qp // next_qp++
487 cmpq %rcx, next_qp // next_qp vs endQPosArray
488 jne L30 // repeat while next_qp < endQPosArray
490 movq %rbx, %rdi // next_full_patt
491 cmpq tempQPosArray, %rcx // endQPosArray vs tempQPosArray
492 jbe L20 // if (endQPosArray <= tempQPosArray) skip the following
493 movq tempQPosArray, %rdx // tempQPosArray
497 movl 4(%rdx), %eax // src_next[1]
498 sall $4, %eax // (src_next[1] << 4)
499 orl (%rdx), %eax // temp = src_next[0] | (src_next[1] << 4)
500 movl %eax, (%rdi) // dest_next[0] = temp;
501 addq $4, %rdi // dest_next++;
502 addq $8, %rdx // src_next += 2;
503 cmpq %rdx, %rcx // source_end vs src_next
504 ja L_pack_4bits // while (src_next < source_end) repeat the loop
506 // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
507 //movq %rdi, %r11 // boundary_tmp
508 movq %rdi, %rax // boundary_tmp
509 subq dest_buf, %rax // boundary_tmp - dest_buf
510 movq %rax, %r13 // boundary_tmp - dest_buf
511 shrq $2, %r13 // boundary_tmp - dest_buf in words
513 movl %r13d, 8(dest_buf) // dest_buf[2] = boundary_tmp - dest_buf
515 movq tempLowBitsArray, %rcx // tempLowBitsArray
516 movq next_low_bits, %rbx // next_low_bits
517 subq %rcx, %rbx // next_low_bits - tempLowBitsArray (in bytes)
518 sarq $2, %rbx // num_tenbits_to_pack (in words)
522 subl $3, size // pre-decrement num_tenbits_to_pack by 3
523 jl 1f // if num_tenbits_to_pack < 3, skip the following loop
527 movl 4(%rcx), %eax // w1
528 sall $10, %eax // w1 << 10
529 movl 8(%rcx), %edx // w2
530 sall $20, %edx // w2 << 20
531 orl %edx, %eax // (w1<<10) | (w2<<20)
532 orl (%rcx), %eax // (w0) | (w1<<10) | (w2<<20)
533 movl %eax, (%rdi) // pack w0,w1,w2 into 1 dest_buf word
534 addq $4, %rdi // dest_buf++
535 addq $12, %rcx // next w0/w1/w2 triplet
536 subl $3, size // num_tenbits_to_pack-=3
537 jge 0b // if no less than 3 elements, back to loop head
539 1: addl $3, size // post-increment num_tenbits_to_pack by 3
540 je 3f // if num_tenbits_to_pack is a multiple of 3, skip the following
541 movl (%rcx), %eax // w0
542 subl $1, size // num_tenbits_to_pack--
544 movl 4(%rcx), %edx // w1
545 sall $10, %edx // w1 << 10
546 orl %edx, %eax // w0 | (w1<<10)
548 2: movl %eax, (%rdi) // write the final dest_buf word
549 addq $4, %rdi // dest_buf++
551 3: movq %rdi, %rax // boundary_tmp
552 subq dest_buf, %rax // boundary_tmp - dest_buf
553 shrq $2, %rax // boundary_tmp - dest_buf in terms of words
554 movl %eax, 12(dest_buf) // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
555 shlq $2, %rax // boundary_tmp - dest_buf in terms of bytes
557 // restore registers and return
569 movb $3, -1(next_tag) // *next_tag = 3 for exact
570 subq dictionary, %rcx // dict_location - dictionary
571 sarq $2, %rcx // divide by 4 for word offset
572 movb %cl, (next_qp) // *next_qp = word offset (4-bit)
573 incq next_qp // next_qp++
578 movb $1, -1(next_tag) // *next_tag = 1 for partial matched
579 movq %rcx, %rax // dict_location
580 subq dictionary, %rax // dict_location - dictionary
581 movl %edx, (%rcx) // *dict_location = input_word;
582 sarq $2, %rax // offset in 32-bit word
583 movb %al, (next_qp) // update *next_qp
584 incq next_qp // next_qp++
585 andl $1023, %edx // lower 10 bits
586 movl %edx, (next_low_bits) // save next_low_bits
587 addq $4, next_low_bits // next_low_bits++
590 // for some reason, keeping the following never executed code yields a better performance
592 leaq -6256(%rbp), %rax
593 movq %rax, -6272(%rbp)
596 #endif // x86_64 architectures
597 #endif // i386 or x86_64 architectures