]>
Commit | Line | Data |
---|---|---|
39236c6e A |
1 | /* |
2 | * Copyright (c) 2000-2013 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | ||
29 | /* | |
30 | This file contains x86_64 hand optimized implementation of WKdm memory page compressor. | |
31 | ||
32 | int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget); | |
33 | ||
34 | input : | |
35 | src_buf : address of input page (length = 1024 words) | |
36 | dest_buf : address of output buffer (may not be 16-byte aligned) | |
37 | scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller, | |
38 | bytes_budget : a given byte target in compression | |
39 | ||
40 | output : | |
41 | ||
42 | if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data | |
43 | o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget. | |
44 | During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area. | |
45 | When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined. | |
46 | The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault. | |
47 | ||
48 | The 4th argument bytes_budget is the target compress budget in bytes. | |
49 | Should the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes. | |
50 | Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed). | |
51 | ||
52 | WKdm Compression algorithm is briefly stated as follows: | |
53 | ||
54 | There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 1 at the point of entry to the function. | |
55 | For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits). | |
56 | a. k = (x>>10)&255; // 8-bit hash table index | |
57 | b. dict_index = hashTable[k]; // 4-bit dictionary index, hashTable[] is fixed | |
58 | c. dict_word = dictionary[dict_index]; // 32-bit dictionary word, dictionary[] is dynamically updated | |
59 | ||
60 | Each input word x is classified/tagged into 4 classes : | |
61 | 0 : x = 0 | |
62 | 1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word | |
63 | 2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss | |
64 | 3 : (x == dict_word), the exact input word is in the dictionary | |
65 | ||
66 | For each class, different numbers of bits are needed for the decompressor to reproduce the original input word. | |
67 | 0 : 2-bits tag (32->2 compression) | |
68 | 1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression) | |
69 | 2 : 2-bits tag + 32-bits new word (32->34 expansion) | |
70 | 3 : 2-bits tag + 4-bits dict_index (32->6 compression) | |
71 | ||
72 | It is obvious now that WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are freqeunt repeats of some word patterns (32->6). | |
73 | ||
74 | the output bit stream (*dest_buf) consists of | |
75 | a. 12 bytes header | |
76 | b. 256 bytes for 1024 packed tags | |
77 | c. (varying number of) words for new words not matched to dictionary word. | |
78 | d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) | |
79 | e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) | |
80 | ||
81 | the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively. | |
82 | Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word. | |
83 | ||
84 | ||
85 | The WKdm compress algorithm 1st runs a scan and classification pass, tagging and write unpacked data into temporary buffers. It follows by packing those data into the output buffer. | |
86 | ||
87 | The temp buffers are | |
88 | ||
89 | uint8_t tempTagsArray[1024]; // temporary saving for tags before final packing | |
90 | uint8_t tempQPosArray[1024]; // temporary saving for dict_indices before final packing | |
91 | uint16_t tempLowBitsArray[1024]; // temporary saving for partially matched lower 10 bits before final packing | |
92 | ||
93 | Since the new words (that can not matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to | |
94 | the destination buffer. | |
95 | ||
96 | uint32_t *new_word = dest_buf+3+64; // 3 words for header, 64 words for tags, new words come right after the tags. | |
97 | ||
98 | Now since we are given a byte budget for this compressor, we can monitor the byte usage on the fly in the scanning and tagging pass. | |
99 | ||
100 | bytes_budget -= 12 + 256; // header and tags (1024 * 2 /8 = 256 bytes) | |
101 | ||
102 | whenever an input word is classified as class | |
103 | ||
104 | 2 : bytes_budget-=4; if (bytes_budget<=0) exit -1; | |
105 | ||
106 | when writing the 8 4-bits/3 10-bits, monitor bytes_budget and exit -1 when byte_budget <=0; | |
107 | ||
108 | without showing the bit budget management, the pseudo code is given as follows: | |
109 | ||
110 | uint8_t *tags=tempTagsArray; | |
111 | uint8_t *dict=tempQPosArray; | |
112 | uint8_t *partial=tempLowBitsArray; | |
113 | ||
114 | for (i=0;i<1024;i++) { | |
115 | x = *src_buf++; | |
116 | if (x == 0) { // zero, 2-bits tag | |
117 | *tags++ = 0; | |
118 | } else { | |
119 | ||
120 | // find dict_index and dict_word from x | |
121 | k = (x>>10)&255; | |
122 | dict_index = hashTable[k]; | |
123 | dict_word = dictionary[dict_index]; | |
124 | ||
125 | if (dict_word == x) { // exactly match | |
126 | // 2-bits tag + 4-bits table index | |
127 | *tags++ = 3; | |
128 | *dict++ = dict_index; | |
129 | } else if (((x^dict_word)>>10)==0) { // 22 higher bits matched | |
130 | // 2-bits tag + 4-bits table index + 10-bits lower partial | |
131 | *tags++ = 1; | |
132 | *dict++ = dict_index; | |
133 | *partial++ = x &0x3ff; | |
134 | dictionary[dict_index] = x; | |
135 | } else { // not matched | |
136 | // 2-bits tag + 32-bits new word | |
137 | *tags++ = 2; | |
138 | *new_word++ = x; | |
139 | dictionary[dict_index] = x; | |
140 | } | |
141 | } | |
142 | } | |
143 | ||
144 | after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf: | |
145 | ||
146 | 1. 1024 tags are packed into 256 bytes right after the 12-bytes header | |
147 | 2. dictionary indices (4-bits each) are packed into are right after the new words section | |
148 | 3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section. | |
149 | ||
150 | cclee, 11/30/12 | |
3e170ce0 A |
151 | |
152 | Added zero page, single value page, sparse page, early abort optimizations | |
153 | rsrini, 09/14/14 | |
154 | ||
39236c6e A |
155 | */ |
156 | ||
157 | .text | |
158 | .align 4,0x90 | |
159 | ||
3e170ce0 A |
160 | #define SV_RETURN $0 // return value when SV, ZV page is found |
161 | #define MZV_MAGIC $17185 // magic value used to identify MZV page encoding | |
162 | #define CHKPT_BYTES 416 // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096] | |
163 | #define CHKPT_TAG_BYTES (CHKPT_BYTES/16) // size of the tags for CHKPT_BYTES of data | |
164 | #define CHKPT_SHRUNK_BYTES 426 // for early aborts: max size of compressed stream to allow further processing .. | |
165 | // .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096 | |
166 | ||
167 | #if CHKPT_BYTES > 4096 | |
168 | #error CHKPT_BYTES must be <= 4096 | |
169 | #endif | |
170 | #if CHKPT_BYTES < 4 | |
171 | #error CHKPT_BYTES must be >= 4 | |
172 | #endif | |
173 | ||
39236c6e A |
174 | .globl _WKdm_compress_new |
175 | _WKdm_compress_new: | |
176 | pushq %rbp | |
177 | movq %rsp, %rbp | |
178 | pushq %r15 | |
179 | pushq %r14 | |
180 | pushq %r13 | |
181 | pushq %r12 | |
182 | pushq %rbx | |
3e170ce0 | 183 | subq $(48+64), %rsp |
39236c6e | 184 | |
3e170ce0 | 185 | #define tempTagsArray 64(%rsp) |
39236c6e | 186 | #define tempLowBitsArray 72(%rsp) |
3e170ce0 A |
187 | |
188 | #define start_next_full_patt 80(%rsp) | |
189 | #define start_next_input_word 88(%rsp) | |
190 | #define byte_budget 96(%rsp) | |
191 | #define start_next_qp tempQPosArray | |
192 | #define start_next_low_bits tempLowBitsArray | |
193 | ||
39236c6e A |
194 | #define next_tag %r8 |
195 | #define next_input_word %rdi | |
196 | #define end_of_input %r13 | |
197 | #define next_full_patt %rbx | |
198 | #define dict_location %rcx | |
199 | #define next_qp %r10 | |
3e170ce0 | 200 | #define checkpoint %r11 |
39236c6e | 201 | #define dictionary %rsp |
39236c6e A |
202 | #define dest_buf %r12 |
203 | #define hashTable %r14 | |
204 | #define tempQPosArray %r15 | |
205 | #define next_low_bits %rsi | |
206 | #define byte_count %r9d | |
207 | ||
208 | movq %rsi, %r12 // dest_buf | |
39236c6e A |
209 | |
210 | movq %rdx, tempTagsArray // &tempTagsArray[0] | |
211 | movq %rdx, next_tag // next_tag always points to the one following the current tag | |
212 | ||
213 | leaq 1024(%rdx), tempQPosArray // &tempQPosArray[0] | |
214 | movq tempQPosArray, next_qp // next_qp | |
215 | ||
3e170ce0 | 216 | leaq CHKPT_BYTES(%rdi), checkpoint // checkpoint = src_buf + CHKPT_BYTES |
39236c6e A |
217 | leaq 4096(%rdi), end_of_input // end_of_input = src_buf + num_input_words |
218 | leaq 268(%rsi), %rbx // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4 | |
219 | ||
220 | movl %ecx, byte_count | |
221 | subl $(12+256), byte_count // header + tags | |
222 | jle L_budgetExhausted | |
223 | ||
3e170ce0 A |
224 | // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO |
225 | // THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES | |
39236c6e | 226 | // PRELOAD_DICTIONARY; |
3e170ce0 A |
227 | movl $0, 0(dictionary) |
228 | movl $0, 4(dictionary) | |
229 | movl $0, 8(dictionary) | |
230 | movl $0, 12(dictionary) | |
231 | movl $0, 16(dictionary) | |
232 | movl $0, 20(dictionary) | |
233 | movl $0, 24(dictionary) | |
234 | movl $0, 28(dictionary) | |
235 | movl $0, 32(dictionary) | |
236 | movl $0, 36(dictionary) | |
237 | movl $0, 40(dictionary) | |
238 | movl $0, 44(dictionary) | |
239 | movl $0, 48(dictionary) | |
240 | movl $0, 52(dictionary) | |
241 | movl $0, 56(dictionary) | |
242 | movl $0, 60(dictionary) | |
39236c6e A |
243 | |
244 | leaq 2048(%rdx), %rax // &tempLowBitsArray[0] | |
245 | movq %rax, tempLowBitsArray // save for later reference | |
246 | movq %rax, next_low_bits // next_low_bits | |
247 | ||
248 | leaq _hashLookupTable_new(%rip), hashTable // hash look up table | |
3e170ce0 A |
249 | |
250 | movq next_full_patt, start_next_full_patt | |
251 | movq next_input_word, start_next_input_word | |
252 | movl %ecx, byte_budget // save the byte budget | |
253 | ||
254 | ||
39236c6e A |
255 | jmp L_scan_loop |
256 | ||
257 | .align 4,0x90 | |
258 | L_RECORD_ZERO: | |
259 | movb $0, -1(next_tag) // *next_tag = ZERO; | |
260 | addq $4, next_input_word // next_input_word++; | |
3e170ce0 A |
261 | cmpq next_input_word, checkpoint // checkpoint time? |
262 | je CHECKPOINT | |
263 | ||
39236c6e A |
264 | L_scan_loop: |
265 | movl (next_input_word), %edx | |
266 | incq next_tag // next_tag++ | |
267 | testl %edx, %edx | |
268 | je L_RECORD_ZERO // if (input_word==0) RECORD_ZERO | |
269 | movl %edx, %eax // a copy of input_word | |
270 | shrl $10, %eax // input_high_bits = HIGH_BITS(input_word); | |
271 | movzbl %al, %eax // 8-bit index to the Hash Table | |
272 | movsbq (hashTable,%rax),%rax // HASH_TO_DICT_BYTE_OFFSET(input_word) | |
273 | leaq (dictionary, %rax), dict_location // ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word)); | |
274 | movl (dict_location), %eax // dict_word = *dict_location; | |
275 | addq $4, next_input_word // next_input_word++ | |
276 | cmpl %eax, %edx // dict_word vs input_word | |
277 | je L_RECORD_EXACT // if identical, RECORD_EXACT | |
278 | xorl %edx, %eax | |
279 | shrl $10, %eax // HIGH_BITS(dict_word) | |
280 | je L_RECORD_PARTIAL // if identical, RECORD_PARTIAL | |
281 | ||
282 | L_RECORD_MISS: | |
283 | movl %edx, (next_full_patt) // *next_full_patt = input_word; | |
284 | addq $4, next_full_patt // next_full_patt++ | |
285 | movl %edx, (dict_location) // *dict_location = input_word | |
286 | movb $2, -1(next_tag) // *next_tag = 2 for miss | |
287 | subl $4, byte_count // fill in a new 4-bytes word | |
288 | jle L_budgetExhausted | |
3e170ce0 A |
289 | cmpq next_input_word, checkpoint // checkpoint time? |
290 | jne L_scan_loop | |
291 | jmp CHECKPOINT | |
39236c6e A |
292 | |
293 | L_done_search: | |
294 | ||
295 | // SET_QPOS_AREA_START(dest_buf,next_full_patt); | |
296 | movq next_full_patt, %rax // next_full_patt | |
297 | subq dest_buf, %rax // next_full_patt - dest_buf | |
298 | sarq $2, %rax // offset in 4-bytes | |
299 | movl %eax, %r13d // r13d = (next_full_patt - dest_buf) | |
300 | movl %eax, 0(dest_buf) // dest_buf[0] = next_full_patt - dest_buf | |
301 | decq next_tag | |
302 | cmpq next_tag, tempTagsArray // &tempTagsArray[0] vs next_tag | |
303 | jae L13 // if (&tempTagsArray[0] >= next_tag), skip the following | |
304 | ||
305 | // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS); | |
306 | ||
307 | movq dest_buf, %rdi // dest_buf | |
308 | movq tempTagsArray, %rcx // &tempTagsArray[0] | |
309 | ||
310 | .align 4,0x90 | |
311 | L_pack_2bits: | |
312 | movq 8(%rcx), %rax // w3 | |
313 | addq $16, %rcx // tempTagsArray += 16; | |
314 | shlq $4, %rax | |
315 | addq $4, %rdi // dest_buf += 4; | |
316 | orq -16(%rcx), %rax // w3 | |
317 | movq %rax, %rdx | |
318 | shrq $30, %rax | |
319 | orl %edx, %eax | |
320 | cmpq %rcx, next_tag // cmp next_tag vs dest_buf | |
321 | movl %eax, 8(%rdi) // save at *(dest_buf + HEADER_SIZE_IN_WORDS) | |
322 | ja L_pack_2bits // if (next_tag > dest_buf) repeat L_pack_2bits | |
323 | ||
324 | /* Pack the queue positions into the area just after the full words. */ | |
325 | ||
326 | L13: | |
327 | mov next_qp, %rax // next_qp | |
328 | sub tempQPosArray, %rax // num_bytes_to_pack = next_qp - (char *) tempQPosArray; | |
329 | addl $7, %eax // num_bytes_to_pack+7 | |
330 | shrl $3, %eax // num_packed_words = (num_bytes_to_pack + 7) >> 3 | |
331 | ||
332 | shll $2, %eax // turn into bytes | |
333 | subl %eax, byte_count // | |
334 | jl L_budgetExhausted | |
335 | shrl $1, %eax // num_source_words = num_packed_words * 2; | |
336 | ||
337 | leaq (tempQPosArray,%rax,4), %rcx // endQPosArray = tempQPosArray + num_source_words | |
338 | cmpq %rcx, next_qp // next_qp vs endQPosArray | |
339 | jae L16 // if (next_qp >= endQPosArray) skip the following zero paddings | |
340 | movq %rcx, %rax | |
341 | subq next_qp, %rax | |
342 | subl $4, %eax | |
343 | jl 1f | |
344 | .align 4,0x90 | |
345 | 0: movl $0, (next_qp) | |
346 | addq $4, next_qp | |
347 | subl $4, %eax | |
348 | jge 0b | |
349 | 1: testl $2, %eax | |
350 | je 1f | |
351 | movw $0, (next_qp) | |
352 | addq $2, next_qp | |
353 | 1: testl $1, %eax | |
354 | je 1f | |
355 | movb $0, (next_qp) | |
356 | addq $1, next_qp | |
357 | 1: | |
358 | L16: | |
359 | movq next_full_patt, %rdi // next_full_patt | |
360 | cmpq tempQPosArray, %rcx // endQPosArray vs tempQPosArray | |
361 | jbe L20 // if (endQPosArray <= tempQPosArray) skip the following | |
362 | movq tempQPosArray, %rdx // tempQPosArray | |
363 | ||
364 | /* byte_count -= (rcx - tempQPosArray)/2 */ | |
365 | ||
366 | .align 4,0x90 | |
367 | L_pack_4bits: | |
368 | movl 4(%rdx), %eax // src_next[1] | |
369 | addq $8, %rdx // src_next += 2; | |
370 | sall $4, %eax // (src_next[1] << 4) | |
371 | addq $4, %rdi // dest_next++; | |
372 | orl -8(%rdx), %eax // temp = src_next[0] | (src_next[1] << 4) | |
373 | cmpq %rdx, %rcx // source_end vs src_next | |
374 | movl %eax, -4(%rdi) // dest_next[0] = temp; | |
375 | ja L_pack_4bits // while (src_next < source_end) repeat the loop | |
376 | ||
377 | // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp); | |
378 | movq %rdi, %rax // boundary_tmp | |
379 | subq dest_buf, %rax // boundary_tmp - dest_buf | |
380 | movq %rax, %r13 // boundary_tmp - dest_buf | |
381 | shrq $2, %r13 // boundary_tmp - dest_buf in words | |
382 | L20: | |
383 | movl %r13d, 4(dest_buf) // dest_buf[1] = boundary_tmp - dest_buf | |
384 | ||
385 | movq tempLowBitsArray, %rcx // tempLowBitsArray | |
386 | movq next_low_bits, %rbx // next_low_bits | |
387 | subq %rcx, %rbx // next_low_bits - tempLowBitsArray (in bytes) | |
388 | sarq $1, %rbx // num_tenbits_to_pack (in half-words) | |
389 | ||
390 | #define size %ebx | |
391 | ||
392 | subl $3, size // pre-decrement num_tenbits_to_pack by 3 | |
393 | jl 1f // if num_tenbits_to_pack < 3, skip the following loop | |
394 | ||
395 | .align 4,0x90 | |
396 | 0: | |
397 | movzwl 4(%rcx), %eax // w2 | |
398 | addq $6, %rcx // next w0/w1/w2 triplet | |
399 | sall $10, %eax // w1 << 10 | |
400 | or -4(%rcx), %ax // w1 | |
401 | addq $4, %rdi // dest_buf++ | |
402 | sall $10, %eax // w1 << 10 | |
403 | or -6(%rcx), %ax // (w0) | (w1<<10) | (w2<<20) | |
404 | subl $4, byte_count // fill in a new 4-bytes word | |
405 | jle L_budgetExhausted | |
406 | subl $3, size // num_tenbits_to_pack-=3 | |
407 | movl %eax, -4(%rdi) // pack w0,w1,w2 into 1 dest_buf word | |
408 | jge 0b // if no less than 3 elements, back to loop head | |
409 | ||
410 | 1: addl $3, size // post-increment num_tenbits_to_pack by 3 | |
411 | je 3f // if num_tenbits_to_pack is a multiple of 3, skip the following | |
412 | movzwl (%rcx), %eax // w0 | |
413 | subl $1, size // num_tenbits_to_pack-- | |
414 | je 2f // | |
415 | movzwl 2(%rcx), %edx // w1 | |
416 | sall $10, %edx // w1 << 10 | |
417 | orl %edx, %eax // w0 | (w1<<10) | |
418 | 2: | |
419 | subl $4, byte_count // fill in a new 4-bytes word | |
420 | jle L_budgetExhausted | |
421 | movl %eax, (%rdi) // write the final dest_buf word | |
422 | addq $4, %rdi // dest_buf++ | |
423 | ||
424 | 3: movq %rdi, %rax // boundary_tmp | |
425 | subq dest_buf, %rax // boundary_tmp - dest_buf | |
426 | shrq $2, %rax // boundary_tmp - dest_buf in terms of words | |
427 | movl %eax, 8(dest_buf) // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp) | |
428 | shlq $2, %rax // boundary_tmp - dest_buf in terms of bytes | |
429 | ||
430 | L_done: | |
431 | // restore registers and return | |
3e170ce0 | 432 | addq $(48+64), %rsp |
39236c6e A |
433 | popq %rbx |
434 | popq %r12 | |
435 | popq %r13 | |
436 | popq %r14 | |
437 | popq %r15 | |
438 | leave | |
439 | ret | |
440 | ||
441 | .align 4 | |
442 | L_budgetExhausted: | |
443 | mov $-1, %rax | |
444 | jmp L_done | |
445 | ||
446 | ||
447 | .align 4,0x90 | |
448 | L_RECORD_EXACT: | |
449 | subq dictionary, %rcx // dict_location - dictionary | |
450 | sarq $2, %rcx // divide by 4 for word offset | |
451 | movb $3, -1(next_tag) // *next_tag = 3 for exact | |
452 | movb %cl, (next_qp) // *next_qp = word offset (4-bit) | |
453 | incq next_qp // next_qp++ | |
3e170ce0 A |
454 | cmpq next_input_word, checkpoint // checkpoint time? |
455 | jne L_scan_loop | |
456 | jmp CHECKPOINT | |
39236c6e A |
457 | |
458 | .align 4,0x90 | |
459 | L_RECORD_PARTIAL: | |
460 | movq %rcx, %rax // dict_location | |
461 | movb $1, -1(next_tag) // *next_tag = 1 for partial matched | |
462 | subq dictionary, %rax // dict_location - dictionary | |
463 | movl %edx, (%rcx) // *dict_location = input_word; | |
464 | sarq $2, %rax // offset in 32-bit word | |
465 | movb %al, (next_qp) // update *next_qp | |
466 | andl $1023, %edx // lower 10 bits | |
467 | incq next_qp // next_qp++ | |
468 | mov %dx, (next_low_bits) // save next_low_bits | |
469 | addq $2, next_low_bits // next_low_bits++ | |
3e170ce0 A |
470 | cmpq next_input_word, checkpoint // checkpoint time? |
471 | jne L_scan_loop | |
472 | ||
473 | CHECKPOINT: | |
474 | ||
475 | cmpq end_of_input, checkpoint // end of buffer or compression ratio check? | |
476 | jne L_check_compression_ratio | |
477 | ||
478 | L_check_zero_page: | |
479 | // check if any dictionary misses in page | |
480 | cmpq start_next_full_patt, next_full_patt | |
481 | jne L_check_single_value_page | |
482 | ||
483 | cmpq start_next_qp, next_qp // check if any partial or exact dictionary matches | |
484 | jne L_check_single_value_page | |
485 | ||
486 | mov SV_RETURN, %rax // Magic return value | |
487 | jmp L_done | |
488 | ||
489 | L_check_single_value_page: | |
490 | ||
491 | movq next_full_patt, %rax // get # dictionary misses | |
492 | subq start_next_full_patt, %rax | |
493 | shrq $2, %rax | |
494 | ||
495 | movq next_qp, %r11 // get # dictionary hits (exact + partial) | |
496 | subq start_next_qp, %r11 | |
497 | ||
498 | movq next_low_bits, %r13 // get # dictionary partial hits | |
499 | subq start_next_low_bits, %r13 | |
500 | shrq $1, %r13 | |
501 | ||
502 | movq tempTagsArray, %r14 // get the address of the first tag | |
503 | ||
504 | // Single value page if one of the follwoing is true: | |
505 | // partial == 0 AND hits == 1023 AND miss == 1 AND tag[0] == 2 (i.e. miss) | |
506 | // partial == 1 AND hits == 1024 AND tag[0] == 1 (i.e. partial) | |
507 | // | |
508 | cmpq $0, %r13 // were there 0 partial hits? | |
509 | jne 1f | |
510 | ||
511 | cmpq $1023, %r11 // were there 1023 dictionary hits | |
512 | jne 1f | |
513 | ||
514 | cmpq $1, %rax // was there exacly 1 dictionary miss? | |
515 | jne 1f | |
516 | ||
517 | cmpb $2, 0(%r14) // was the very 1st tag a miss? | |
518 | je L_is_single_value_page | |
519 | ||
520 | 1: | |
521 | cmpq $1, %r13 // was there 1 partial hit? | |
522 | jne L_check_mostly_zero | |
523 | ||
524 | cmpq $1024, %r11 // were there 1024 dictionary hits | |
525 | jne L_check_mostly_zero | |
526 | ||
527 | cmpb $1, 0(%r14) // was the very 1st tag a partial? | |
528 | jne L_check_mostly_zero | |
529 | ||
530 | L_is_single_value_page: | |
531 | ||
532 | mov SV_RETURN, %rax // Magic return value | |
533 | jmp L_done | |
534 | ||
535 | L_check_mostly_zero: | |
536 | // how much space will the sparse packer take? | |
537 | addq %r11, %rax // rax += (next_qp - start_next_qp) | |
538 | movq $6, %rdx | |
539 | mulq %rdx // rax *= 6 (i.e. 4 byte word + 2 byte offset) | |
540 | addq $4, %rax // rax += 4 byte for header | |
541 | movq %rax, %r11 | |
542 | // how much space will the defaut packer take? | |
543 | movq next_low_bits, %rax | |
544 | subq start_next_low_bits, %rax // get bytes consumed by lower-10 bits | |
545 | movq $1365, %rdx | |
546 | mulq %rdx | |
547 | shrq $11, %rax // rax = 2/3*(next_low_bits - start_next_low_bits) | |
548 | movq next_full_patt, %rdx | |
549 | subq start_next_full_patt, %rdx // get bytes consumed by dictionary misses | |
550 | addq %rdx, %rax // rax += (next_full_patt - start_next_full_patt) | |
551 | movq next_qp, %rdx | |
552 | subq start_next_qp, %rdx | |
553 | shrq $1, %rdx // get bytes consumed by dictionary hits | |
554 | addq %rdx, %rax // rax += (next_qp - start_next_qp)/2 | |
555 | addq $(12+256), %rax // rax += bytes taken by the header + tags | |
556 | ||
557 | cmpq %r11, %rax // is default packer the better option? | |
558 | jb L_done_search | |
559 | ||
560 | cmpl byte_budget, %r11d // can the sparse packer fit into the given budget? | |
561 | ja L_budgetExhausted | |
562 | ||
563 | L_sparse_packer: | |
564 | ||
565 | movl MZV_MAGIC, 0(dest_buf) // header to indicate a sparse packer | |
566 | addq $4, dest_buf | |
567 | ||
568 | movq $0, %rdx // rdx = byte offset in src of non-0 word | |
569 | movq start_next_input_word, %r8 | |
570 | 1: | |
571 | movq 0(%r8, %rdx), %rax // rax = read dword | |
572 | testq %rax, %rax // is dword == 0 | |
573 | jne 5f | |
574 | 3: | |
575 | addq $8, %rdx // 8 more bytes have been processed | |
576 | 4: | |
577 | cmpq $4096, %rdx | |
578 | jne 1b | |
579 | movq %r11, %rax // store the size of the compressed stream | |
580 | jmp L_done | |
581 | ||
582 | 5: | |
583 | testl %eax, %eax // is lower word == 0 | |
584 | je 6f | |
585 | movl %eax, 0(dest_buf) // store the non-0 word in the dest buffer | |
586 | mov %dx, 4(dest_buf) // store the byte index | |
587 | addq $6, dest_buf | |
588 | 6: | |
589 | shrq $32, %rax // get the upper word into position | |
590 | testl %eax, %eax // is upper word == 0 | |
591 | je 3b | |
592 | addq $4, %rdx | |
593 | movl %eax, 0(dest_buf) // store the word in the dest buffer | |
594 | mov %dx, 4(dest_buf) // store the byte index | |
595 | addq $6, dest_buf | |
596 | addq $4, %rdx | |
597 | jmp 4b | |
598 | ||
599 | L_check_compression_ratio: | |
600 | ||
601 | movq end_of_input, checkpoint // checkpoint = end of buffer | |
602 | ||
603 | movq next_low_bits, %rax | |
604 | subq start_next_low_bits, %rax // get bytes consumed by lower-10 bits | |
605 | movq $1365, %rdx | |
606 | mulq %rdx | |
607 | shrq $11, %rax // rax = 2/3*(next_low_bits - start_next_low_bits) | |
608 | ||
609 | movq next_full_patt, %rdx | |
610 | subq start_next_full_patt, %rdx // get bytes consumed by dictionary misses | |
611 | addq %rdx, %rax // rax += (next_full_patt - start_next_full_patt) | |
612 | ||
613 | movq next_qp, %rdx | |
614 | subq start_next_qp, %rdx | |
615 | shrq $1, %rdx | |
616 | addq %rdx, %rax // rax += (next_qp - start_next_qp)/2 | |
617 | ||
618 | addq $CHKPT_TAG_BYTES, %rax // rax += bytes taken by the tags | |
619 | cmpq $CHKPT_SHRUNK_BYTES, %rax | |
620 | ja L_budgetExhausted // compressed size exceeds budget | |
621 | jmp L_scan_loop | |
39236c6e | 622 |