git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2014 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29	/*
	30	This file contains arm64 hand optimized implementation of WKdm memory page compressor.
	31
	32	int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
	33
	34	input :
	35	src_buf : address of input page (length = 1024 words)
	36	dest_buf : address of output buffer (may not be 16-byte aligned)
	37	scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller,
	38	bytes_budget : a given byte target in compression
	39
	40	output :
	41
	42	if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data
	43	o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget.
	44	During the scan and tag process, each word that can not be compressed will be written to dest_buf, right after the 12-byte header + 256-byte tag area.
	45	When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined.
	46	The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault.
	47
	48	The 4th argument bytes_budget is the target compress budget in bytes.
	49	If the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes.
	50	Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed).
	51
	52	WKdm Compression algorithm is briefly stated as follows:
	53
	54	There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 0 at the point of entry to the function (the code notes that zero initialization is needed to efficiently detect single value pages).
	55	For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits).
	56	a. k = (x>>10)&255; // 8-bit hash table index
	57	b. dict_index = hashTable[k]; // 4-bit dictionary index, hashTable[] is fixed
	58	c. dict_word = dictionary[dict_index]; // 32-bit dictionary word, dictionary[] is dynamically updated
	59
	60	Each input word x is classified/tagged into 4 classes :
	61	0 : x = 0
	62	1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word
	63	2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss
	64	3 : (x == dict_word), the exact input word is in the dictionary
	65
	66	For each class, different numbers of bits are needed for the decompressor to reproduce the original input word.
	67	0 : 2-bits tag (32->2 compression)
	68	1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression)
	69	2 : 2-bits tag + 32-bits new word (32->34 expansion)
	70	3 : 2-bits tag + 4-bits dict_index (32->6 compression)
	71
	72	It is obvious now that the WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are frequent repeats of some word patterns (32->6).
	73
	74	the output bit stream (*dest_buf) consists of
	75	a. 12 bytes header
	76	b. 256 bytes for 1024 packed tags
	77	c. (varying number of) words for new words not matched to dictionary word.
	78	d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
	79	e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
	80
	81	the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
	82	Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word.
	83
	84
	85	The WKdm compress algorithm 1st runs a scan and classification pass, tagging and write unpacked data into temporary buffers. It follows by packing those data into the output buffer.
	86
	87	The temp buffers are
	88
	89	uint8_t tempTagsArray[1024]; // temporary saving for tags before final packing
	90	uint8_t tempQPosArray[1024]; // temporary saving for dict_indices before final packing
	91	uint16_t tempLowBitsArray[1024]; // temporary saving for partially matched lower 10 bits before final packing
	92
	93	Since the new words (that can not be matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to
	94	the destination buffer.
	95
	96	uint32_t *new_word = dest_buf+3+64; // 3 words for header, 64 words for tags, new words come right after the tags.
	97
	98	Now since we are given a byte budget for this compressor, we can monitor the byte (or bit) usage on the fly in the scanning and tagging pass.
	99
	100	byte_count -= 12 + 256; // byte budget minus header and tags
	101
	102	whenever an input word is classified as class
	103
	104	2 : byte_count -= 4;
	105
	106	the compress function can early exit (return -1) should the page can not be compressed with the given byte budget (i.e., byte_count <= 0).
	107
	108	without showing the bit budget management, the pseudo code is given as follows:
	109
	110	uint8_t *tags=tempTagsArray;
	111	uint8_t *dict=tempQPosArray;
	112	uint16_t *partial=tempLowBitsArray;
	113
	114	for (i=0;i<1024;i++) {
	115	x = *src_buf++;
	116	if (x == 0) { // zero, 2-bits tag
	117	*tags++ = 0;
	118	} else {
	119
	120	// find dict_index and dict_word from x
	121	k = (x>>10)&255;
	122	dict_index = hashTable[k];
	123	dict_word = dictionary[dict_index];
	124
	125	if (dict_word == x) { // exactly match
	126	// 2-bits tag + 4-bits table index
	127	*tags++ = 3;
	128	*dict++ = dict_index;
	129	} else if (((x^dict_word)>>10)==0) { // 22 higher bits matched
	130	// 2-bits tag + 4-bits table index + 10-bits lower partial
	131	*tags++ = 1;
	132	*dict++ = dict_index;
	133	*partial++ = x &0x3ff;
	134	dictionary[dict_index] = x;
	135	} else { // not matched
	136	// 2-bits tag + 32-bits new word
	137	*tags++ = 2;
	138	*new_word++ = x;
	139	dictionary[dict_index] = x;
	140	}
	141	}
	142	}
	143
	144	after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf:
	145
	146	1. 1024 tags are packed into 256 bytes right after the 12-bytes header
	147	2. dictionary indices (4-bits each) are packed right after the new words section
	148	3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section.
	149
	150	cclee, 11/9/12
	151
	152	Added zero page, single value page, sparse page, early abort optimizations
	153	rsrini, 09/14/14
	154	*/
	155
	156	#ifndef PAGES_SIZE_IN_KBYTES
	157	#define PAGES_SIZE_IN_KBYTES 4 // default to 4KB pages when the build does not specify a page size
	158	#endif
	159
	160	#if !((PAGES_SIZE_IN_KBYTES==4) || (PAGES_SIZE_IN_KBYTES==16))
	161	#error "Only PAGES_SIZE_IN_KBYTES = 4 or 16 is supported"
	162	#endif
	163
	164
	165	.text
	166	.align 4
	167
	168	/*
	169	int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
	170	*/
	171
	172	.globl _WKdm_compress_4k
	173	_WKdm_compress_4k:
	174
	175	/*
	176	------------------------- symbolizing register use -----------------------------------
	177	*/
	178	#define src_buf x0 // 1st argument: address of the input page
	179	#define next_input_word x0 // same register as src_buf: read cursor, post-incremented by the scan loop
	180	#define dest_buf x1 // 2nd argument: address of the output buffer
	181	#define scratch x2 // 3rd argument: caller-provided scratch memory
	182	#define byte_count x3 // 4th argument (bytes_budget); counted down as output bytes are committed
	183	#define next_tag x4 // write cursor into tempTagsArray
	184	#define tempTagsArray x2 // scratch
	185	#define dictionary x5 // base address of the 64-byte dictionary kept on the stack
	186	#define remaining x6 // input words left before the next CHECKPOINT
	187	#define next_full_patt x7 // write cursor for new (missed) words in dest_buf
	188	#define dict_location x8 // byte offset of the hashed entry within the dictionary
	189	#define wdict_location w8 // 32-bit view of dict_location
	190	#define next_qp x9 // write cursor into tempQPosArray
	191	#define hashTable x10 // address of _hashLookupTable
	192	#define tempQPosArray x11 // &tempQPosArray[0]
	193	#define next_low_bits x12 // write cursor into tempLowBitsArray
	194
	195	/*
	196	this arm64 assembly code is ported from x86_64 assembly code,
	197	therefore need such symbolization to quickly reuse the x86_64 assembly code
	198	for these intermediate/temporary register use
	199	*/
	200	#define rax x13
	201	#define eax w13
	202	#define rcx x14
	203	#define ecx w14
	204	#define rdx x15
	205	#define edx w15
	206	#define rdi x0 /* after some point, x0/rdi becomes free for other usage */
	207
	208
	209	/*
	210	------------------------- scratch memory --------------------------------------
	211
	212	need 16*4 (dictionary) + 256*4 (tempTagsArray) + 256*4 (tempQPosArray) + 1024*4 (tempLowBitsArray)
	213	total 6208 bytes
	214	[sp,#0] : dictionary
	215	[scratch,#0] : tempTagsArray
	216	[scratch,#1024] : tempQPosArray
	217	[scratch,#2048] : tempLowBitsArray
	218	*/
	219
	220	#define scale (PAGES_SIZE_IN_KBYTES/4) // 1 for 4KB pages, 4 for 16KB pages
	221
	222	#define SV_RETURN 0 // return value when SV, ZV page is found (presumably single-value / zero-value page)
	223	#define MZV_MAGIC 17185 // magic value used to identify MZV page encoding (written as the first word by L_sparse_packer)
	224	#define CHKPT_BYTES 416 // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096]
	225	#define CHKPT_WORDS (CHKPT_BYTES/4) // checkpoint bytes in words
	226	#define CHKPT_TAG_BYTES (CHKPT_BYTES/16) // size of the tags for CHKPT_BYTES of data
	227	#define CHKPT_SHRUNK_BYTES 426 // for early aborts: max size of compressed stream to allow further processing ..
	228	// .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096
	229	#if CHKPT_BYTES > 4096
	230	#error CHKPT_BYTES must be <= 4096
	231	#endif
	232	#if CHKPT_BYTES < 4
	233	#error CHKPT_BYTES must be >= 4
	234	#endif
	235
	236	#if KERNEL
	237	sub sp, sp, #64 // kernel builds only: make room for v0-v3
	238	st1.4s {v0,v1,v2,v3},[sp] // save v0-v3, which L_pack_2bits overwrites
	239	#endif
	240
	241	sub sp, sp, #64 // allocate for dictionary
	242	mov dictionary, sp // use x5 to point to sp, so we can use sub xd, xn, sp
	243
	244	sub sp, sp, #64 // allocate space for saving callee-saved registers
	245	mov x15, sp // x15 = base of the register save area
	246	stp x20, x21, [x15, #0] // save x20, x21
	247	stp x22, x23, [x15, #16] // save x22, x23
	248	stp x24, x25, [x15, #32] // save x24, x25
	249	stp x26, x27, [x15, #48] // save x26, x27
	250
	251	/*
	252	------- entwined stack space allocation, registers set up, and PRELOAD_DICTIONARY -------------------
	253	*/
	254
	255	// NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO
	256	// THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES
	257	mov next_tag, tempTagsArray // &tempTagsArray[0]
	258	add next_qp, scratch, #(1024*scale) // next_qp = &tempQPosArray[0]
	259	mov remaining, #(CHKPT_WORDS*scale) // remaining input words .. initially set to checkpoint
	260	add next_full_patt, dest_buf, #(12+256*scale) // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4
	261	sub byte_count, byte_count, #(12+256*scale) // byte_count - header - tags
	262	add next_low_bits, scratch, #(2048*scale) // &tempLowBitsArray[0]
	263	stp xzr, xzr, [dictionary, #0] // initialize dictionary
	264	adrp hashTable, _hashLookupTable@GOTPAGE // page of the GOT slot for _hashLookupTable
	265	stp xzr, xzr, [dictionary, #16] // initialize dictionary
	266	stp xzr, xzr, [dictionary, #32] // initialize dictionary
	267	ldr hashTable, [hashTable, _hashLookupTable@GOTPAGEOFF] // hashTable = address of _hashLookupTable, read from the GOT
	268	stp xzr, xzr, [dictionary, #48] // initialize dictionary
	269
	270	#define EARLYCHECK 0 // mode value: the early-abort checkpoint has not been evaluated yet
	271	#define NORMAL 1 // mode value: past the early-abort checkpoint
	272
	273	#define mode w20 // EARLYCHECK or NORMAL; tested at CHECKPOINT
	274	#define start_next_full_patt x21 // initial value of next_full_patt
	275	#define start_next_input_word x22 // initial value of next_input_word (start of the input page)
	276	#define start_next_low_bits x23 // initial value of next_low_bits
	277	#define r11 x24 // temporary used by the CHECKPOINT code
	278	#define r13 x25 // temporary used by the CHECKPOINT code
	279	#define byte_budget x26 // the bytes_budget value passed in by the caller
	280	#define start_next_qp tempQPosArray // tempQPosArray is also the initial value of next_qp
	281
	282	add tempQPosArray, scratch, #(1024*scale) // &tempQPosArray[0]
	283	mov mode, EARLYCHECK // indicate we are yet to evaluate the early aborts
	284	mov start_next_full_patt, next_full_patt // remember the start of next_full_patt
	285	mov start_next_input_word, next_input_word // remember the start of next_input_word
	286	mov start_next_low_bits, next_low_bits // remember the start of next_low_bit
	287	add byte_budget, byte_count, #(12+256*scale) // remember the byte budget
	288
	289	b L_loop // enter the scan loop, skipping over L_RECORD_ZERO which is placed ahead of the loop head
	290
	291	.align 4, 0x90
	292
	293	/* we've just detected a zero input word in edx */
	294	L_RECORD_ZERO:
	295	strb edx, [next_tag], #1 // *next_tag++ = ZERO; edx is used as input word, and if we are here edx = 0
	296	subs remaining, remaining, #1 // remaining--;
	297	b.le CHECKPOINT // if remaining = 0, break; otherwise fall through into L_loop
	298
	299	/* -------------- scan/tag pass loop ------------------------- */
	300	L_loop:
	301
	302	/* load new input word to edx */
	303	ldr edx, [next_input_word], #4
	304	cbz edx, L_RECORD_ZERO // if (input_word==0) RECORD_ZERO
	305
	306	/*
	307	now the input word edx is nonzero, we next find the corresponding dictionary word (eax) and dict_location
	308	*/
	309	ubfm eax, edx, #10, #17 // eax = (input_word >> 10) & 255, the hash table index
	310	ldrb wdict_location, [hashTable, rax] // HASH_TO_DICT_BYTE_OFFSET(input_word)
	311	ldr eax, [dictionary, dict_location] // dict_word = *dict_location;
	312
	313	/* detect whether we match input to its corresponding dictionary word */
	314	eor eax, eax, edx // dict_word vs input_word
	315	cbz eax, L_RECORD_EXACT // if identical, RECORD_EXACT
	316	lsr eax, eax, #10 // HIGH_BITS(dict_word^input_word)
	317	cbz eax, L_RECORD_PARTIAL // if identical, RECORD_PARTIAL
	318
	319	L_RECORD_MISS:
	320	/*
	321	if we are here, the input word can not be derived from the dictionary,
	322	we write the input word as a new word,
	323	and update the dictionary with this new word
	324	*/
	325	subs byte_count, byte_count, #4 // byte_count -= 4
	326	b.le L_budgetExhausted // return -1 to signal this page is not compressable
	327	str edx, [next_full_patt], #4 // *next_full_patt++ = input_word;
	328	mov eax, #2 // tag for MISS
	329	subs remaining, remaining, #1 // remaining--; these flags feed the b.gt below (the stores in between leave flags untouched)
	330	str edx, [dictionary, dict_location] // *dict_location = input_word
	331	strb eax, [next_tag], #1 // *next_tag++ = 2 for miss
	332	b.gt L_loop // if remaining > 0, repeat
	333	b CHECKPOINT
	334
	335	L_done_search:
	336
	337	// SET_QPOS_AREA_START(dest_buf,next_full_patt);
	338	/* 1st word in dest_buf header = 4-byte offset (from start) of end of new word section */
	339
	340	sub rax, next_full_patt, dest_buf // next_full_patt - dest_buf
	341	lsr eax, eax, #2 // offset in 4-bytes
	342	str eax, [dest_buf] // dest_buf[0] = next_full_patt - dest_buf
	343
	344	/* -------------------------- packing 1024 tags into 256 bytes ----------------------------------------*/
	345	// boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
	346
	347	add rdi, dest_buf, #12 // rdi = dest_buf + 12, the packed tags area right after the 3-word header
	348	mov rcx, tempTagsArray // &tempTagsArray[0]
	349
	350	// each iteration reads 32 one-byte tags and writes 8 bytes of packed 2-bit tags
	351	L_pack_2bits:
	352	ld1.2s {v0,v1,v2,v3},[rcx],#32 // load 32 tag bytes (8 per register), advance source by 32
	353	shl.2d v1,v1,#4 // move the tags of v1/v3 up by 4 bits ..
	354	shl.2d v3,v3,#4
	355
	356	orr.8b v0, v0, v1 // .. and merge them into v0/v2
	357	orr.8b v2, v2, v3
	358
	359	ushr.2d v1, v0, #30 // bring the upper 32-bit half down, offset by 2 bits ..
	360	ushr.2d v3, v2, #30
	361
	362	orr.8b v0, v0, v1 // .. and merge, so the low 32 bits hold the packed tags
	363	orr.8b v2, v2, v3
	364
	365	zip1.2s v0, v0, v2 // gather the low 32-bit lanes of v0 and v2
	366	st1.2s {v0},[rdi],#8 // write 8 packed bytes, advance destination by 8
	367	cmp next_tag, rcx
	368	b.hi L_pack_2bits // repeat while the source pointer is still below next_tag
	369
	370	/* --------------------------------- packing 4-bits dict indices into dest_buf ---------------------------------- */
	371
	372	/* 1st, round up number of 4-bits dict_indices to a multiple of 8 and fill in 0 if needed */
	373	sub rax, next_qp, tempQPosArray // eax = num_bytes_to_pack = next_qp - (char *) tempQPosArray;
	374	add eax, eax, #7 // num_bytes_to_pack+7
	375	lsr eax, eax, #3 // num_packed_words = (num_bytes_to_pack + 7) >> 3
	376	add rcx, tempQPosArray, rax, lsl #3 // endQPosArray = tempQPosArray + 8*num_packed_words (8 index bytes per packed word)
	377	lsl rax, rax, #2 // bytes needed for the packed indices = 4*num_packed_words
	378	subs byte_count, byte_count, rax // charge the packed indices against the budget
	379	b.lt L_budgetExhausted // return -1 if the budget went negative
	380
	381	cmp rcx, next_qp // endQPosArray vs next_qp
	382	b.ls 2f // if (next_qp >= endQPosArray) skip the following zero paddings
	383	sub rax, rcx, next_qp // number of zero padding bytes to append
	384	mov edx, #0
	385	tst eax, #4 // write the padding as 4 + 2 + 1 byte stores, selected by the bits of the count
	386	b.eq 1f
	387	str edx, [next_qp], #4
	388	1: tst eax, #2
	389	b.eq 1f
	390	strh edx, [next_qp], #2
	391	1: tst eax, #1
	392	b.eq 2f
	393	strb edx, [next_qp], #1
	394	2:
	395	mov rdi, next_full_patt // next_full_patt
	396	cmp rcx, tempQPosArray // endQPosArray vs tempQPosArray
	397	ldr eax, [dest_buf] // eax = dest_buf[0]; this is what gets stored at L20 when there is nothing to pack
	398	b.ls L20 // if (endQPosArray <= tempQPosArray) skip the following
	399	mov rdx, tempQPosArray // tempQPosArray
	400
	401	/* packing 4-bits dict indices into dest_buf */
	402	L_pack_4bits:
	403	ldr rax, [rdx], #8 // src_next[1]:src_next[0]
	404	orr rax, rax, rax, lsr #28 // eax = src_next[0] | (src_next[1] << 4)
	405	cmp rcx, rdx // source_end vs src_next
	406	str eax, [rdi], #4 // *dest_next++ = temp;
	407	b.hi L_pack_4bits // while (src_next < source_end) repeat the loop
	408
	409	// SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
	410	sub rax, rdi, dest_buf // boundary_tmp - dest_buf
	411	lsr eax, eax, #2 // boundary_tmp - dest_buf in words
	412	L20:
	413	str eax, [dest_buf,#4] // dest_buf[1] = boundary_tmp - dest_buf
	414
	415
	416
	417	/* --------------------------- packing 3 10-bits low bits into a 32-bit word in dest_buf[] ----------------------------------------- */
	418
	419	add rcx, scratch, #(2048*scale) // tempLowBitsArray
	420	sub rdx, next_low_bits, rcx // next_low_bits - tempLowBitsArray (in bytes)
	421	lsr rdx, rdx, #1 // num_tenbits_to_pack (in half-words)
	422	subs edx, edx, #3 // pre-decrement num_tenbits_to_pack by 3
	423	b.lt 1f // if num_tenbits_to_pack < 3, skip the following loop
	424	0:
	425	subs byte_count, byte_count, #4 // byte_count -= 4
	426	b.le L_budgetExhausted // return -1 to signal this page is not compressable
	427	subs edx, edx, #3 // num_tenbits_to_pack-=3; these flags feed the b.ge at the loop tail
	428	ldr rax, [rcx], #6 // load 8 bytes, but advance by only 3 half-words
	429	bfm rax, rax, #58, #9 // pack 1st toward 2nd
	430	bfm rax, rax, #58, #25 // pack 1st/2nd toward 3rd
	431	lsr rax, rax, #12
	432	str eax, [rdi], #4 // pack w0,w1,w2 into 1 dest_buf word
	433	b.ge 0b // if no less than 3 elements, back to loop head
	434
	435	1: adds edx, edx, #3 // post-increment num_tenbits_to_pack by 3
	436	b.eq 3f // if num_tenbits_to_pack is a multiple of 3, skip the following
	437	subs byte_count, byte_count, #4 // byte_count -= 4
	438	b.le L_budgetExhausted // return -1 to signal this page is not compressable
	439	ldrh eax,[rcx] // w0
	440	subs edx, edx, #1 // num_tenbits_to_pack--
	441	b.eq 2f // only one leftover element: store w0 alone
	442	ldrh edx, [rcx, #2] // w1
	443	orr eax, eax, edx, lsl #10 // w0 | (w1<<10)
	444
	445	2: str eax, [rdi], #4 // write the final dest_buf word
	446
	447	3: sub rax, rdi, dest_buf // boundary_tmp - dest_buf
	448	lsr eax, eax, #2 // boundary_tmp - dest_buf in terms of words
	449	str eax, [dest_buf, #8] // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
	450	lsl w0, eax, #2 // return value: boundary_tmp - dest_buf in terms of bytes
	451
	452	L_done:
	453
	454	// restore registers and return (the return value is already in x0)
	455	mov x15, sp
	456	ldp x20, x21, [x15, #0] // restore x20, x21
	457	ldp x22, x23, [x15, #16] // restore x22, x23
	458	ldp x24, x25, [x15, #32] // restore x24, x25
	459	ldp x26, x27, [x15, #48] // restore x26, x27
	460	add sp, sp, #128 // deallocate for dictionary + saved register space
	461
	462	#if KERNEL
	463	ld1.4s {v0,v1,v2,v3},[sp],#64 // restore v0-v3 and release their 64-byte save area
	464	#endif
	465	ret lr
	466
	467	.align 4
	468	L_budgetExhausted:
	469	mov x0, #-1 // return -1: the page does not fit in the given budget
	470	b L_done
	471
	472
	473	.align 4,0x90
	474	L_RECORD_EXACT:
	475	/*
	476	we have an exact match of the input word to its corresponding dictionary word
	477	write tag/dict_index to the temporary buffers
	478	*/
	479	mov eax, #3
	480	lsr w14, wdict_location, #2 // divide by 4 for word offset
	481	subs remaining, remaining, #1 // remaining--; these flags feed the b.gt below
	482	strb eax, [next_tag], #1 // *next_tag++ = 3 for exact
	483	strb w14, [next_qp], #1 // *next_qp++ = word offset (4-bit)
	484	b.gt L_loop // if remaining > 0, repeat
	485	b CHECKPOINT // if remaining = 0, break
	486
	487	.align 4,0x90
	488	L_RECORD_PARTIAL:
	489	/*
	490	we have a partial (high 22-bits) match of the input word to its corresponding dictionary word
	491	write tag/dict_index/low 10 bits to the temporary buffers
	492	*/
	493	mov ecx, #1
	494	strb ecx, [next_tag], #1 // *next_tag++ = 1 for partial matched
	495	str edx, [dictionary, dict_location] // *dict_location = input_word;
	496	subs remaining, remaining, #1 // remaining--; these flags feed the b.gt below
	497	lsr eax, wdict_location, #2 // offset in 32-bit word
	498	and edx, edx, #1023 // lower 10 bits
	499	strb eax, [next_qp], #1 // update *next_qp++
	500	strh edx, [next_low_bits], #2 // save next_low_bits++
	501	b.gt L_loop // if remaining > 0, repeat; otherwise fall through into CHECKPOINT
	502
	503	CHECKPOINT:
	504
	505	cbz mode, L_check_compression_ratio // if this is an early abort check..
	506
	507	L_check_zero_page:
	508
	509	cmp start_next_full_patt, next_full_patt // check if any dictionary misses in page
	510	b.ne L_check_single_value_page
	511
	512	cmp start_next_qp, next_qp // check if any partial or exact dictionary matches
	513	b.ne L_check_single_value_page
	514
	515	mov x0, #SV_RETURN // Magic return value: no misses and no hits were recorded
	516	b L_done
	517
	518	L_check_single_value_page:
	519
	520	sub rax, next_full_patt, start_next_full_patt // get # dictionary misses
	521	lsr rax, rax, #2 // bytes -> words
	522
	523	sub r11, next_qp, start_next_qp // get # dictionary hits (exact + partial)
	524
	525	sub r13, next_low_bits, start_next_low_bits // get # dictionary partial hits
	526	lsr r13, r13, #1 // bytes -> half-words
	527
	528	// Single value page if one of the following is true:
	529	// partial == 0 AND hits == 1023(for 4K page) AND miss == 1 AND tag[0] == 2 (i.e. miss)
	530	// partial == 1 AND hits == 1024(for 4K page) AND tag[0] == 1 (i.e. partial)
	531	//
	532	cbnz r13, 1f // were there 0 partial hits?
	533
	534	cmp r11, #(256*PAGES_SIZE_IN_KBYTES - 1) // were there 1023 dictionary hits
	535	b.ne 1f
	536
	537	cmp rax, #1 // was there exactly 1 dictionary miss?
	538	b.ne 1f
	539
	540	ldrb edx, [tempTagsArray] // read the very 1st tag
	541	cmp edx, #2 // was the very 1st tag a miss?
	542	b.eq L_is_single_value_page
	543
	544	1:
	545	cmp r13, #1 // was there 1 partial hit?
	546	b.ne L_check_mostly_zero
	547
	548	cmp r11, #(256*PAGES_SIZE_IN_KBYTES) // were there 1024 dictionary hits
	549	b.ne L_check_mostly_zero
	550
	551	ldrb edx, [tempTagsArray] // read the very 1st tag
	552	cmp edx, #1 // was the very 1st tag a partial?
	553	b.ne L_check_mostly_zero
	554
	555	L_is_single_value_page:
	556
	557	mov x0, #SV_RETURN // Magic return value
	558	b L_done
	559
	560	L_check_mostly_zero:
	561	// how much space will the sparse packer take?
	562	add rax, rax, r11 // rax += (next_qp - start_next_qp)
	563	mov rdx, #6
	564	mov rcx, #4
	565	madd r11, rax, rdx, rcx // r11 = rax * 6 (i.e. 4 byte word + 2 byte offset) + 4 byte for header
	566
	567	sub rax, next_low_bits, start_next_low_bits // get bytes consumed by lower-10 bits
	568	mov rdx, #1365 // 1365/2048 is approximately 2/3; the lsr #11 below completes the scaling
	569	mul rax, rax, rdx
	570
	571	sub rdx, next_full_patt, start_next_full_patt // get bytes consumed by dictionary misses
	572	add rax, rdx, rax, lsr #11 // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
	573
	574	sub rdx, next_qp, start_next_qp
	575	add rax, rax, rdx, lsr #1 // rax += (next_qp - start_next_qp)/2
	576	add rax, rax, #(12+256*scale) // rax += bytes taken by the header + tags
	577
	578	cmp rax, r11 // is the default packer the better option?
	579	b.lt L_done_search
	580
	581	cmp r11, byte_budget // can the sparse packer fit into the given budget?
	582	b.gt L_budgetExhausted
	583
	584	// sparse output: a 4-byte MZV_MAGIC header, then for every non-zero input word a 4-byte value followed by its 2-byte byte offset
	585	L_sparse_packer:
	586	mov edx, #MZV_MAGIC
	587	str edx, [dest_buf], #4 // header to indicate a sparse packer
	588	mov rdx, #0 // rdx = byte offset in src of non-0 word
	589	1:
	590	ldr rax, [start_next_input_word, rdx] // rax = read dword
	591	cbnz rax, 5f // is dword != 0
	592	3:
	593	add rdx, rdx, #8 // 8 more bytes have been processed
	594	4:
	595	cmp rdx, #(4096*scale) // has the entire page been processed
	596	b.ne 1b
	597	mov x0, r11 // store the size of the compressed stream
	598	b L_done
	599
	600	5:
	601	cbz eax, 6f // is lower word == 0
	602	str eax, [dest_buf], #4 // store the non-0 word in the dest buffer
	603	strh edx, [dest_buf], #2 // store the byte index
	604	6:
	605	lsr rax, rax, 32 // get the upper word into position
	606	cbz eax, 3b // is dword == 0
	607	add rdx, rdx, #4 // rdx = byte offset of the upper word
	608	str eax, [dest_buf], #4 // store the non-0 word in the dest buffer
	609	strh edx, [dest_buf], #2 // store the byte index
	610	add rdx, rdx, #4 // step past the upper word: 8 bytes processed in total
	611	b 4b
	612
	613	L_check_compression_ratio:
	614
	615	mov mode, NORMAL // the early-abort check runs only once; the next CHECKPOINT takes the normal path
	616	mov remaining, #((1024 - CHKPT_WORDS)*scale) // remaining input words to process
	617	cbz remaining, CHECKPOINT // if there are no remaining words to process
	618
	619	sub rax, next_low_bits, start_next_low_bits // get bytes consumed by lower-10 bits
	620	mov rdx, #1365 // 1365/2048 is approximately 2/3; the lsr #11 below completes the scaling
	621	mul rax, rax, rdx
	622
	623	sub rdx, next_full_patt, start_next_full_patt // get bytes consumed by dictionary misses
	624	add rax, rdx, rax, lsr #11 // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
	625
	626	sub rdx, next_qp, start_next_qp
	627	add rax, rax, rdx, lsr #1 // rax += (next_qp - start_next_qp)/2
	628	subs rax, rax, #((CHKPT_SHRUNK_BYTES - CHKPT_TAG_BYTES)*scale)
	629	// rax += CHKPT_TAG_BYTES; rax -= CHKPT_SHRUNK_BYTES
	630
	631	b.gt L_budgetExhausted // if rax is > 0, we need to early abort
	632	b L_loop // otherwise resume scanning the rest of the page