]> git.saurik.com Git - apple/xnu.git/blame - osfmk/x86_64/WKdmCompress_new.s
xnu-3248.40.184.tar.gz
[apple/xnu.git] / osfmk / x86_64 / WKdmCompress_new.s
CommitLineData
39236c6e
A
1/*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 This file contains x86_64 hand optimized implementation of WKdm memory page compressor.
31
32 int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
33
34 input :
35 src_buf : address of input page (length = 1024 words)
36 dest_buf : address of output buffer (may not be 16-byte aligned)
37 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller,
38 bytes_budget : a given byte target in compression
39
40 output :
41
42 if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data
43 o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget.
44 During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area.
45 When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined.
46 The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault.
47
48 The 4th argument bytes_budget is the target compress budget in bytes.
49 Should the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes.
50 Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed).
51
52 WKdm Compression algorithm is briefly stated as follows:
53
54 There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 1 at the point of entry to the function.
55 For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits).
56 a. k = (x>>10)&255; // 8-bit hash table index
57 b. dict_index = hashTable[k]; // 4-bit dictionary index, hashTable[] is fixed
58 c. dict_word = dictionary[dict_index]; // 32-bit dictionary word, dictionary[] is dynamically updated
59
60 Each input word x is classified/tagged into 4 classes :
61 0 : x = 0
62 1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word
63 2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss
64 3 : (x == dict_word), the exact input word is in the dictionary
65
66 For each class, different numbers of bits are needed for the decompressor to reproduce the original input word.
67 0 : 2-bits tag (32->2 compression)
68 1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression)
69 2 : 2-bits tag + 32-bits new word (32->34 expansion)
70 3 : 2-bits tag + 4-bits dict_index (32->6 compression)
71
72 It is obvious now that WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are freqeunt repeats of some word patterns (32->6).
73
74 the output bit stream (*dest_buf) consists of
75 a. 12 bytes header
76 b. 256 bytes for 1024 packed tags
77 c. (varying number of) words for new words not matched to dictionary word.
78 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
79 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
80
81 the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
82 Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word.
83
84
85 The WKdm compress algorithm 1st runs a scan and classification pass, tagging and write unpacked data into temporary buffers. It follows by packing those data into the output buffer.
86
87 The temp buffers are
88
89 uint8_t tempTagsArray[1024]; // temporary saving for tags before final packing
90 uint8_t tempQPosArray[1024]; // temporary saving for dict_indices before final packing
91 uint16_t tempLowBitsArray[1024]; // temporary saving for partially matched lower 10 bits before final packing
92
93 Since the new words (that can not matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to
94 the destination buffer.
95
96 uint32_t *new_word = dest_buf+3+64; // 3 words for header, 64 words for tags, new words come right after the tags.
97
98 Now since we are given a byte budget for this compressor, we can monitor the byte usage on the fly in the scanning and tagging pass.
99
100 bytes_budget -= 12 + 256; // header and tags (1024 * 2 /8 = 256 bytes)
101
102 whenever an input word is classified as class
103
104 2 : bytes_budget-=4; if (bytes_budget<=0) exit -1;
105
106 when writing the 8 4-bits/3 10-bits, monitor bytes_budget and exit -1 when byte_budget <=0;
107
108 without showing the bit budget management, the pseudo code is given as follows:
109
110 uint8_t *tags=tempTagsArray;
111 uint8_t *dict=tempQPosArray;
112 uint8_t *partial=tempLowBitsArray;
113
114 for (i=0;i<1024;i++) {
115 x = *src_buf++;
116 if (x == 0) { // zero, 2-bits tag
117 *tags++ = 0;
118 } else {
119
120 // find dict_index and dict_word from x
121 k = (x>>10)&255;
122 dict_index = hashTable[k];
123 dict_word = dictionary[dict_index];
124
125 if (dict_word == x) { // exactly match
126 // 2-bits tag + 4-bits table index
127 *tags++ = 3;
128 *dict++ = dict_index;
129 } else if (((x^dict_word)>>10)==0) { // 22 higher bits matched
130 // 2-bits tag + 4-bits table index + 10-bits lower partial
131 *tags++ = 1;
132 *dict++ = dict_index;
133 *partial++ = x &0x3ff;
134 dictionary[dict_index] = x;
135 } else { // not matched
136 // 2-bits tag + 32-bits new word
137 *tags++ = 2;
138 *new_word++ = x;
139 dictionary[dict_index] = x;
140 }
141 }
142 }
143
144 after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf:
145
146 1. 1024 tags are packed into 256 bytes right after the 12-bytes header
147 2. dictionary indices (4-bits each) are packed into are right after the new words section
148 3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section.
149
150 cclee, 11/30/12
3e170ce0
A
151
152 Added zero page, single value page, sparse page, early abort optimizations
153 rsrini, 09/14/14
154
39236c6e
A
155*/
156
157 .text
158 .align 4,0x90
159
3e170ce0
A
160#define SV_RETURN $0 // return value when SV, ZV page is found
161#define MZV_MAGIC $17185 // magic value used to identify MZV page encoding
162#define CHKPT_BYTES 416 // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096]
163#define CHKPT_TAG_BYTES (CHKPT_BYTES/16) // size of the tags for CHKPT_BYTES of data
164#define CHKPT_SHRUNK_BYTES 426 // for early aborts: max size of compressed stream to allow further processing ..
165 // .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096
166
167#if CHKPT_BYTES > 4096
168 #error CHKPT_BYTES must be <= 4096
169#endif
170#if CHKPT_BYTES < 4
171 #error CHKPT_BYTES must be >= 4
172#endif
173
39236c6e
A
174.globl _WKdm_compress_new
175_WKdm_compress_new:
176 pushq %rbp
177 movq %rsp, %rbp
178 pushq %r15
179 pushq %r14
180 pushq %r13
181 pushq %r12
182 pushq %rbx
3e170ce0 183 subq $(48+64), %rsp
39236c6e 184
3e170ce0 185 #define tempTagsArray 64(%rsp)
39236c6e 186 #define tempLowBitsArray 72(%rsp)
3e170ce0
A
187
188 #define start_next_full_patt 80(%rsp)
189 #define start_next_input_word 88(%rsp)
190 #define byte_budget 96(%rsp)
191 #define start_next_qp tempQPosArray
192 #define start_next_low_bits tempLowBitsArray
193
39236c6e
A
194 #define next_tag %r8
195 #define next_input_word %rdi
196 #define end_of_input %r13
197 #define next_full_patt %rbx
198 #define dict_location %rcx
199 #define next_qp %r10
3e170ce0 200 #define checkpoint %r11
39236c6e 201 #define dictionary %rsp
39236c6e
A
202 #define dest_buf %r12
203 #define hashTable %r14
204 #define tempQPosArray %r15
205 #define next_low_bits %rsi
206 #define byte_count %r9d
207
208 movq %rsi, %r12 // dest_buf
39236c6e
A
209
210 movq %rdx, tempTagsArray // &tempTagsArray[0]
211 movq %rdx, next_tag // next_tag always points to the one following the current tag
212
213 leaq 1024(%rdx), tempQPosArray // &tempQPosArray[0]
214 movq tempQPosArray, next_qp // next_qp
215
3e170ce0 216 leaq CHKPT_BYTES(%rdi), checkpoint // checkpoint = src_buf + CHKPT_BYTES
39236c6e
A
217 leaq 4096(%rdi), end_of_input // end_of_input = src_buf + num_input_words
218 leaq 268(%rsi), %rbx // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4
219
220 movl %ecx, byte_count
221 subl $(12+256), byte_count // header + tags
222 jle L_budgetExhausted
223
3e170ce0
A
224 // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO
225 // THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES
39236c6e 226 // PRELOAD_DICTIONARY;
3e170ce0
A
227 movl $0, 0(dictionary)
228 movl $0, 4(dictionary)
229 movl $0, 8(dictionary)
230 movl $0, 12(dictionary)
231 movl $0, 16(dictionary)
232 movl $0, 20(dictionary)
233 movl $0, 24(dictionary)
234 movl $0, 28(dictionary)
235 movl $0, 32(dictionary)
236 movl $0, 36(dictionary)
237 movl $0, 40(dictionary)
238 movl $0, 44(dictionary)
239 movl $0, 48(dictionary)
240 movl $0, 52(dictionary)
241 movl $0, 56(dictionary)
242 movl $0, 60(dictionary)
39236c6e
A
243
244 leaq 2048(%rdx), %rax // &tempLowBitsArray[0]
245 movq %rax, tempLowBitsArray // save for later reference
246 movq %rax, next_low_bits // next_low_bits
247
248 leaq _hashLookupTable_new(%rip), hashTable // hash look up table
3e170ce0
A
249
250 movq next_full_patt, start_next_full_patt
251 movq next_input_word, start_next_input_word
252 movl %ecx, byte_budget // save the byte budget
253
254
39236c6e
A
255 jmp L_scan_loop
256
257 .align 4,0x90
258L_RECORD_ZERO:
259 movb $0, -1(next_tag) // *next_tag = ZERO;
260 addq $4, next_input_word // next_input_word++;
3e170ce0
A
261 cmpq next_input_word, checkpoint // checkpoint time?
262 je CHECKPOINT
263
39236c6e
A
264L_scan_loop:
265 movl (next_input_word), %edx
266 incq next_tag // next_tag++
267 testl %edx, %edx
268 je L_RECORD_ZERO // if (input_word==0) RECORD_ZERO
269 movl %edx, %eax // a copy of input_word
270 shrl $10, %eax // input_high_bits = HIGH_BITS(input_word);
271 movzbl %al, %eax // 8-bit index to the Hash Table
272 movsbq (hashTable,%rax),%rax // HASH_TO_DICT_BYTE_OFFSET(input_word)
273 leaq (dictionary, %rax), dict_location // ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word));
274 movl (dict_location), %eax // dict_word = *dict_location;
275 addq $4, next_input_word // next_input_word++
276 cmpl %eax, %edx // dict_word vs input_word
277 je L_RECORD_EXACT // if identical, RECORD_EXACT
278 xorl %edx, %eax
279 shrl $10, %eax // HIGH_BITS(dict_word)
280 je L_RECORD_PARTIAL // if identical, RECORD_PARTIAL
281
282L_RECORD_MISS:
283 movl %edx, (next_full_patt) // *next_full_patt = input_word;
284 addq $4, next_full_patt // next_full_patt++
285 movl %edx, (dict_location) // *dict_location = input_word
286 movb $2, -1(next_tag) // *next_tag = 2 for miss
287 subl $4, byte_count // fill in a new 4-bytes word
288 jle L_budgetExhausted
3e170ce0
A
289 cmpq next_input_word, checkpoint // checkpoint time?
290 jne L_scan_loop
291 jmp CHECKPOINT
39236c6e
A
292
293L_done_search:
294
295 // SET_QPOS_AREA_START(dest_buf,next_full_patt);
296 movq next_full_patt, %rax // next_full_patt
297 subq dest_buf, %rax // next_full_patt - dest_buf
298 sarq $2, %rax // offset in 4-bytes
299 movl %eax, %r13d // r13d = (next_full_patt - dest_buf)
300 movl %eax, 0(dest_buf) // dest_buf[0] = next_full_patt - dest_buf
301 decq next_tag
302 cmpq next_tag, tempTagsArray // &tempTagsArray[0] vs next_tag
303 jae L13 // if (&tempTagsArray[0] >= next_tag), skip the following
304
305 // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
306
307 movq dest_buf, %rdi // dest_buf
308 movq tempTagsArray, %rcx // &tempTagsArray[0]
309
310 .align 4,0x90
311L_pack_2bits:
312 movq 8(%rcx), %rax // w3
313 addq $16, %rcx // tempTagsArray += 16;
314 shlq $4, %rax
315 addq $4, %rdi // dest_buf += 4;
316 orq -16(%rcx), %rax // w3
317 movq %rax, %rdx
318 shrq $30, %rax
319 orl %edx, %eax
320 cmpq %rcx, next_tag // cmp next_tag vs dest_buf
321 movl %eax, 8(%rdi) // save at *(dest_buf + HEADER_SIZE_IN_WORDS)
322 ja L_pack_2bits // if (next_tag > dest_buf) repeat L_pack_2bits
323
324 /* Pack the queue positions into the area just after the full words. */
325
326L13:
327 mov next_qp, %rax // next_qp
328 sub tempQPosArray, %rax // num_bytes_to_pack = next_qp - (char *) tempQPosArray;
329 addl $7, %eax // num_bytes_to_pack+7
330 shrl $3, %eax // num_packed_words = (num_bytes_to_pack + 7) >> 3
331
332 shll $2, %eax // turn into bytes
333 subl %eax, byte_count //
334 jl L_budgetExhausted
335 shrl $1, %eax // num_source_words = num_packed_words * 2;
336
337 leaq (tempQPosArray,%rax,4), %rcx // endQPosArray = tempQPosArray + num_source_words
338 cmpq %rcx, next_qp // next_qp vs endQPosArray
339 jae L16 // if (next_qp >= endQPosArray) skip the following zero paddings
340 movq %rcx, %rax
341 subq next_qp, %rax
342 subl $4, %eax
343 jl 1f
344 .align 4,0x90
3450: movl $0, (next_qp)
346 addq $4, next_qp
347 subl $4, %eax
348 jge 0b
3491: testl $2, %eax
350 je 1f
351 movw $0, (next_qp)
352 addq $2, next_qp
3531: testl $1, %eax
354 je 1f
355 movb $0, (next_qp)
356 addq $1, next_qp
3571:
358L16:
359 movq next_full_patt, %rdi // next_full_patt
360 cmpq tempQPosArray, %rcx // endQPosArray vs tempQPosArray
361 jbe L20 // if (endQPosArray <= tempQPosArray) skip the following
362 movq tempQPosArray, %rdx // tempQPosArray
363
364 /* byte_count -= (rcx - tempQPosArray)/2 */
365
366 .align 4,0x90
367L_pack_4bits:
368 movl 4(%rdx), %eax // src_next[1]
369 addq $8, %rdx // src_next += 2;
370 sall $4, %eax // (src_next[1] << 4)
371 addq $4, %rdi // dest_next++;
372 orl -8(%rdx), %eax // temp = src_next[0] | (src_next[1] << 4)
373 cmpq %rdx, %rcx // source_end vs src_next
374 movl %eax, -4(%rdi) // dest_next[0] = temp;
375 ja L_pack_4bits // while (src_next < source_end) repeat the loop
376
377 // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
378 movq %rdi, %rax // boundary_tmp
379 subq dest_buf, %rax // boundary_tmp - dest_buf
380 movq %rax, %r13 // boundary_tmp - dest_buf
381 shrq $2, %r13 // boundary_tmp - dest_buf in words
382L20:
383 movl %r13d, 4(dest_buf) // dest_buf[1] = boundary_tmp - dest_buf
384
385 movq tempLowBitsArray, %rcx // tempLowBitsArray
386 movq next_low_bits, %rbx // next_low_bits
387 subq %rcx, %rbx // next_low_bits - tempLowBitsArray (in bytes)
388 sarq $1, %rbx // num_tenbits_to_pack (in half-words)
389
390 #define size %ebx
391
392 subl $3, size // pre-decrement num_tenbits_to_pack by 3
393 jl 1f // if num_tenbits_to_pack < 3, skip the following loop
394
395 .align 4,0x90
3960:
397 movzwl 4(%rcx), %eax // w2
398 addq $6, %rcx // next w0/w1/w2 triplet
399 sall $10, %eax // w1 << 10
400 or -4(%rcx), %ax // w1
401 addq $4, %rdi // dest_buf++
402 sall $10, %eax // w1 << 10
403 or -6(%rcx), %ax // (w0) | (w1<<10) | (w2<<20)
404 subl $4, byte_count // fill in a new 4-bytes word
405 jle L_budgetExhausted
406 subl $3, size // num_tenbits_to_pack-=3
407 movl %eax, -4(%rdi) // pack w0,w1,w2 into 1 dest_buf word
408 jge 0b // if no less than 3 elements, back to loop head
409
4101: addl $3, size // post-increment num_tenbits_to_pack by 3
411 je 3f // if num_tenbits_to_pack is a multiple of 3, skip the following
412 movzwl (%rcx), %eax // w0
413 subl $1, size // num_tenbits_to_pack--
414 je 2f //
415 movzwl 2(%rcx), %edx // w1
416 sall $10, %edx // w1 << 10
417 orl %edx, %eax // w0 | (w1<<10)
4182:
419 subl $4, byte_count // fill in a new 4-bytes word
420 jle L_budgetExhausted
421 movl %eax, (%rdi) // write the final dest_buf word
422 addq $4, %rdi // dest_buf++
423
4243: movq %rdi, %rax // boundary_tmp
425 subq dest_buf, %rax // boundary_tmp - dest_buf
426 shrq $2, %rax // boundary_tmp - dest_buf in terms of words
427 movl %eax, 8(dest_buf) // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
428 shlq $2, %rax // boundary_tmp - dest_buf in terms of bytes
429
430L_done:
431 // restore registers and return
3e170ce0 432 addq $(48+64), %rsp
39236c6e
A
433 popq %rbx
434 popq %r12
435 popq %r13
436 popq %r14
437 popq %r15
438 leave
439 ret
440
441 .align 4
442L_budgetExhausted:
443 mov $-1, %rax
444 jmp L_done
445
446
447 .align 4,0x90
448L_RECORD_EXACT:
449 subq dictionary, %rcx // dict_location - dictionary
450 sarq $2, %rcx // divide by 4 for word offset
451 movb $3, -1(next_tag) // *next_tag = 3 for exact
452 movb %cl, (next_qp) // *next_qp = word offset (4-bit)
453 incq next_qp // next_qp++
3e170ce0
A
454 cmpq next_input_word, checkpoint // checkpoint time?
455 jne L_scan_loop
456 jmp CHECKPOINT
39236c6e
A
457
458 .align 4,0x90
459L_RECORD_PARTIAL:
460 movq %rcx, %rax // dict_location
461 movb $1, -1(next_tag) // *next_tag = 1 for partial matched
462 subq dictionary, %rax // dict_location - dictionary
463 movl %edx, (%rcx) // *dict_location = input_word;
464 sarq $2, %rax // offset in 32-bit word
465 movb %al, (next_qp) // update *next_qp
466 andl $1023, %edx // lower 10 bits
467 incq next_qp // next_qp++
468 mov %dx, (next_low_bits) // save next_low_bits
469 addq $2, next_low_bits // next_low_bits++
3e170ce0
A
470 cmpq next_input_word, checkpoint // checkpoint time?
471 jne L_scan_loop
472
473CHECKPOINT:
474
475 cmpq end_of_input, checkpoint // end of buffer or compression ratio check?
476 jne L_check_compression_ratio
477
478L_check_zero_page:
479 // check if any dictionary misses in page
480 cmpq start_next_full_patt, next_full_patt
481 jne L_check_single_value_page
482
483 cmpq start_next_qp, next_qp // check if any partial or exact dictionary matches
484 jne L_check_single_value_page
485
486 mov SV_RETURN, %rax // Magic return value
487 jmp L_done
488
489L_check_single_value_page:
490
491 movq next_full_patt, %rax // get # dictionary misses
492 subq start_next_full_patt, %rax
493 shrq $2, %rax
494
495 movq next_qp, %r11 // get # dictionary hits (exact + partial)
496 subq start_next_qp, %r11
497
498 movq next_low_bits, %r13 // get # dictionary partial hits
499 subq start_next_low_bits, %r13
500 shrq $1, %r13
501
502 movq tempTagsArray, %r14 // get the address of the first tag
503
504 // Single value page if one of the follwoing is true:
505 // partial == 0 AND hits == 1023 AND miss == 1 AND tag[0] == 2 (i.e. miss)
506 // partial == 1 AND hits == 1024 AND tag[0] == 1 (i.e. partial)
507 //
508 cmpq $0, %r13 // were there 0 partial hits?
509 jne 1f
510
511 cmpq $1023, %r11 // were there 1023 dictionary hits
512 jne 1f
513
514 cmpq $1, %rax // was there exacly 1 dictionary miss?
515 jne 1f
516
517 cmpb $2, 0(%r14) // was the very 1st tag a miss?
518 je L_is_single_value_page
519
5201:
521 cmpq $1, %r13 // was there 1 partial hit?
522 jne L_check_mostly_zero
523
524 cmpq $1024, %r11 // were there 1024 dictionary hits
525 jne L_check_mostly_zero
526
527 cmpb $1, 0(%r14) // was the very 1st tag a partial?
528 jne L_check_mostly_zero
529
530L_is_single_value_page:
531
532 mov SV_RETURN, %rax // Magic return value
533 jmp L_done
534
535L_check_mostly_zero:
536 // how much space will the sparse packer take?
537 addq %r11, %rax // rax += (next_qp - start_next_qp)
538 movq $6, %rdx
539 mulq %rdx // rax *= 6 (i.e. 4 byte word + 2 byte offset)
540 addq $4, %rax // rax += 4 byte for header
541 movq %rax, %r11
542 // how much space will the defaut packer take?
543 movq next_low_bits, %rax
544 subq start_next_low_bits, %rax // get bytes consumed by lower-10 bits
545 movq $1365, %rdx
546 mulq %rdx
547 shrq $11, %rax // rax = 2/3*(next_low_bits - start_next_low_bits)
548 movq next_full_patt, %rdx
549 subq start_next_full_patt, %rdx // get bytes consumed by dictionary misses
550 addq %rdx, %rax // rax += (next_full_patt - start_next_full_patt)
551 movq next_qp, %rdx
552 subq start_next_qp, %rdx
553 shrq $1, %rdx // get bytes consumed by dictionary hits
554 addq %rdx, %rax // rax += (next_qp - start_next_qp)/2
555 addq $(12+256), %rax // rax += bytes taken by the header + tags
556
557 cmpq %r11, %rax // is default packer the better option?
558 jb L_done_search
559
560 cmpl byte_budget, %r11d // can the sparse packer fit into the given budget?
561 ja L_budgetExhausted
562
563L_sparse_packer:
564
565 movl MZV_MAGIC, 0(dest_buf) // header to indicate a sparse packer
566 addq $4, dest_buf
567
568 movq $0, %rdx // rdx = byte offset in src of non-0 word
569 movq start_next_input_word, %r8
5701:
571 movq 0(%r8, %rdx), %rax // rax = read dword
572 testq %rax, %rax // is dword == 0
573 jne 5f
5743:
575 addq $8, %rdx // 8 more bytes have been processed
5764:
577 cmpq $4096, %rdx
578 jne 1b
579 movq %r11, %rax // store the size of the compressed stream
580 jmp L_done
581
5825:
583 testl %eax, %eax // is lower word == 0
584 je 6f
585 movl %eax, 0(dest_buf) // store the non-0 word in the dest buffer
586 mov %dx, 4(dest_buf) // store the byte index
587 addq $6, dest_buf
5886:
589 shrq $32, %rax // get the upper word into position
590 testl %eax, %eax // is upper word == 0
591 je 3b
592 addq $4, %rdx
593 movl %eax, 0(dest_buf) // store the word in the dest buffer
594 mov %dx, 4(dest_buf) // store the byte index
595 addq $6, dest_buf
596 addq $4, %rdx
597 jmp 4b
598
599L_check_compression_ratio:
600
601 movq end_of_input, checkpoint // checkpoint = end of buffer
602
603 movq next_low_bits, %rax
604 subq start_next_low_bits, %rax // get bytes consumed by lower-10 bits
605 movq $1365, %rdx
606 mulq %rdx
607 shrq $11, %rax // rax = 2/3*(next_low_bits - start_next_low_bits)
608
609 movq next_full_patt, %rdx
610 subq start_next_full_patt, %rdx // get bytes consumed by dictionary misses
611 addq %rdx, %rax // rax += (next_full_patt - start_next_full_patt)
612
613 movq next_qp, %rdx
614 subq start_next_qp, %rdx
615 shrq $1, %rdx
616 addq %rdx, %rax // rax += (next_qp - start_next_qp)/2
617
618 addq $CHKPT_TAG_BYTES, %rax // rax += bytes taken by the tags
619 cmpq $CHKPT_SHRUNK_BYTES, %rax
620 ja L_budgetExhausted // compressed size exceeds budget
621 jmp L_scan_loop
39236c6e 622