git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2013 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29	/*
	30	This file contains armv7 hand optimized implementation of WKdm memory page compressor.
	31
	32	int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
	33
	34	input :
	35	src_buf : address of input page (length = 1024 words)
	36	dest_buf : address of output buffer (may not be 16-byte aligned)
	37	scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller,
	38	bytes_budget : a given byte target in compression
	39
	40	output :
	41
	42	if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data
	43	o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget.
	44	During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area.
	45	When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined.
	46	The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault.
	47
	48	The 4th argument bytes_budget is the target compress budget in bytes.
	49	Should the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes.
	50	Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed).
	51
	52	WKdm Compression algorithm is briefly stated as follows:
	53
	54	There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 1 at the point of entry to the function.
	55	For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits).
	56	a. k = (x>>10)&255; // 8-bit hash table index
	57	b. dict_index = hashTable[k]; // 4-bit dictionary index, hashTable[] is fixed
	58	c. dict_word = dictionary[dict_index]; // 32-bit dictionary word, dictionary[] is dynamically updated
	59
	60	Each input word x is classified/tagged into 4 classes :
	61	0 : x = 0
	62	1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word
	63	2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss
	64	3 : (x == dict_word), the exact input word is in the dictionary
	65
	66	For each class, different numbers of bits are needed for the decompressor to reproduce the original input word.
	67	0 : 2-bits tag (32->2 compression)
	68	1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression)
	69	2 : 2-bits tag + 32-bits new word (32->34 expansion)
	70	3 : 2-bits tag + 4-bits dict_index (32->6 compression)
	71
	72	It is obvious now that WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are freqeunt repeats of some word patterns (32->6).
	73
	74	the output bit stream (*dest_buf) consists of
	75	a. 12 bytes header
	76	b. 256 bytes for 1024 packed tags
	77	c. (varying number of) words for new words not matched to dictionary word.
	78	d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
	79	e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
	80
	81	the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
	82	Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word.
	83
	84
	85	The WKdm compress algorithm 1st runs a scan and classification pass, tagging and write unpacked data into temporary buffers. It follows by packing those data into the output buffer.
	86
	87	The temp buffers are
	88
	89	uint8_t tempTagsArray[1024]; // temporary saving for tags before final packing
	90	uint8_t tempQPosArray[1024]; // temporary saving for dict_indices before final packing
	91	uint16_t tempLowBitsArray[1024]; // temporary saving for partially matched lower 10 bits before final packing
	92
	93	Since the new words (that can not matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to
	94	the destination buffer.
	95
	96	uint32_t *new_word = dest_buf+3+64; // 3 words for header, 64 words for tags, new words come right after the tags.
	97
	98	Now since we are given a byte budget for this compressor, we can monitor the byte usage on the fly in the scanning and tagging pass.
	99
	100	byte_count = bytes_budget - 12 - 256; // header + tags
	101
	102	whenever an input word is classified as class
	103
	104	2 : byte_count -= 4;
	105
	106	in 4-bit/10-bit packing, we can also return -1 when byte_budget <=0;
	107
	108	Note : since there might be extra padding bits for class 1 and 3, it is complicated to track this padding bits on the fly. To compromise, we change class 1 to
	109
	110	without showing the bit budget management, the pseudo code is given as follows:
	111
	112	uint8_t *tags=tempTagsArray;
	113	uint8_t *dict=tempQPosArray;
	114	uint8_t *partial=tempLowBitsArray;
	115
	116	for (i=0;i<1024;i++) {
	117	x = *src_buf++;
	118	if (x == 0) { // zero, 2-bits tag
	119	*tags++ = 0;
	120	} else {
	121
	122	// find dict_index and dict_word from x
	123	k = (x>>10)&255;
	124	dict_index = hashTable[k];
	125	dict_word = dictionary[dict_index];
	126
	127	if (dict_word == x) { // exactly match
	128	// 2-bits tag + 4-bits table index
	129	*tags++ = 3;
	130	*dict++ = dict_index;
	131	} else if (((x^dict_word)>>10)==0) { // 22 higher bits matched
	132	// 2-bits tag + 4-bits table index + 10-bits lower partial
	133	*tags++ = 1;
	134	*dict++ = dict_index;
	135	*partial++ = x &0x3ff;
	136	dictionary[dict_index] = x;
	137	} else { // not matched
	138	// 2-bits tag + 32-bits new word
	139	*tags++ = 2;
	140	*new_word++ = x;
	141	dictionary[dict_index] = x;
	142	}
	143	}
	144	}
	145
	146	after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf:
	147
	148	1. 1024 tags are packed into 256 bytes right after the 12-bytes header
	149	2. dictionary indices (4-bits each) are packed into are right after the new words section
	150	3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section.
	151
	152	cclee, 11/9/12
	153
	154	Added zero page, single value page, sparse page, early abort optimizations
	155	rsrini, 09/14/14
	156	*/
	157	.text
	158	.align 4
	159
	160	// int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
	161
	162	.globl _WKdm_compress_new
	163	_WKdm_compress_new:
	164
	165	/*
	166	------------------------- symbolizing register use -----------------------------------
	167	*/
	168
	169	#define src_buf r0
	170	#define next_input_word r0
	171	#define dest_buf r1
	172	#define scratch r2
	173	#define dictionary sp
	174	#define byte_count r3
	175
	176	#define next_tag r12
	177
	178	#define remaining r4
	179	#define next_full_patt r5
	180	#define dict_location r6
	181	#define next_qp r8
	182	#define hashTable r9
	183	#define next_low_bits r10
	184	#define eax r11
	185	#define ecx r12
	186	#define edx lr
	187	#define rdi r6
	188
	189	#define tempTagsArray scratch
	190	#define R11 r0 // only safe to use between phase-1 and phase-2
	191	#define R13 r4 // only safe to use between phase-1 and phase-2
	192	/*
	193	------------------------- allocate scratch memory for local use --------------------------------------
	194
	195	need 2564 (tempTagsArray) + 2564 (tempQPosArray) + 1024*2 (tempLowBitsArray)
	196	total 4096 bytes
	197	[scratch,#0] : tempTagsArray
	198	[scratch,#1024] : tempQPosArray
	199	[scratch,#2048] : tempLowBitsArray
	200
	201	[sp,#0] : dictionary
	202
	203	*/
	204
	205	#define TagsArray_offset 0
	206	#define QPosArray_offset 1024
	207	#define LowBitsArray_offset 2048
	208
	209	#define SV_RETURN 0 // return value when SV, ZV page is found
	210	#define MZV_MAGIC 17185 // magic value used to identify MZV page encoding
	211	#define CHKPT_BYTES 416 // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096]
	212	#define CHKPT_WORDS (CHKPT_BYTES/4) // checkpoint bytes in words
	213	#define CHKPT_TAG_BYTES (CHKPT_BYTES/16) // size of the tags for CHKPT_BYTES of data
	214	#define CHKPT_SHRUNK_BYTES 426 // for early aborts: max size of compressed stream to allow further processing ..
	215	// .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096
	216	#if CHKPT_BYTES > 4096
	217	#error CHKPT_BYTES must be <= 4096
	218	#endif
	219	#if CHKPT_BYTES < 4
	220	#error CHKPT_BYTES must be >= 4
	221	#endif
	222
	223	push {r7,lr}
	224	mov r7, sp
	225	push {r4-r6,r8-r11}
	226
	227	#if KERNEL
	228	sub sp, sp, #32
	229	vst1.64 {q0,q1}, [sp]
	230	#endif
	231
	232	sub sp, sp, #(64+24) // reserve stack space for temps + dictionary
	233
	234	/*
	235	----- set up registers and initialize WKdm dictionary ----------
	236	*/
	237	// NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO
	238	// THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES
	239	mov eax, #0
	240
	241	mov next_tag, scratch // &tempTagsArray[0]
	242	vdup.32 q0, eax
	243
	244	add next_qp, scratch, #QPosArray_offset // next_qp
	245	mov lr, sp
	246	mov remaining, #(CHKPT_WORDS) // remaining input words .. initially set to checkpoint
	247	vst1.64 {q0}, [lr]!
	248	add next_full_patt, dest_buf, #268 // dest_buf + [TAGS_AREA_OFFSET + (4096 / 16)]*4
	249	vst1.64 {q0}, [lr]!
	250	vst1.64 {q0}, [lr]!
	251	add next_low_bits, scratch, #LowBitsArray_offset // &tempLowBitsArray[0]
	252	vst1.64 {q0}, [lr]!
	253
	254	#if defined(KERNEL) && !SLIDABLE
	255	adr hashTable, L_table
	256	ldr hashTable, [hashTable]
	257	#else
	258	ldr hashTable, L_table
	259	L_table0:
	260	ldr hashTable, [pc, hashTable]
	261	#endif
	262
	263	#define EARLYCHECK 0
	264	#define NORMAL 1
	265
	266	#define mode [sp, #64]
	267	#define start_next_full_patt [sp, #68]
	268	#define start_next_input_word [sp, #72]
	269	#define start_next_low_bits [sp, #76]
	270	#define byte_budget [sp, #80]
	271
	272	mov edx, #EARLYCHECK
	273	str edx, mode // indicate we are yet to evaluate the early aborts
	274	str next_full_patt, start_next_full_patt // remember the start of next_full_patt
	275	str next_input_word, start_next_input_word // remember the start of next_input_word
	276	str next_low_bits, start_next_low_bits // remember the start of next_low_bits
	277	str byte_count, byte_budget // remember the byte budget
	278
	279	sub byte_count, byte_count, #(12+256) // byte_count - header bytes - tags bytes
	280	b L_scan_loop
	281
	282	.align 4, 0x90
	283	L_RECORD_ZERO:
	284	/* we've just detected a zero input word in edx */
	285	strb edx, [next_tag], #1 // *next_tag++ = ZERO;
	286	subs remaining, remaining, #1 // remaining input words
	287	ble CHECKPOINT // if remaining = 0, break
	288
	289	/* WKdm compress scan/tag loop */
	290	L_scan_loop:
	291	ldr edx, [next_input_word], #4
	292	cmp edx, #0
	293	beq L_RECORD_ZERO // if (input_word==0) RECORD_ZERO
	294
	295	/*
	296	now the input word edx is nonzero, we next find the corresponding dictionary word (eax) and dict_location
	297	*/
	298	and eax, edx, #(0xff<<10) // part of input_word for hash table index
	299	lsr eax, eax, #10 // 8-bit index to the Hash Table
	300	ldrb eax, [hashTable, eax] // HASH_TO_DICT_BYTE_OFFSET(input_word)
	301	add dict_location, dictionary, eax // ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word));
	302	ldr eax, [dictionary, eax] // dict_word = *dict_location;
	303	cmp eax, edx // dict_word vs input_word
	304	beq L_RECORD_EXACT // if identical, RECORD_EXACT
	305
	306	eor eax, eax, edx
	307	lsrs eax, eax, #10 // HIGH_BITS(dict_word)
	308	beq L_RECORD_PARTIAL // if identical, RECORD_PARTIAL
	309
	310	L_RECORD_MISS:
	311	/*
	312	if we are here, the input word can not be derived from the dictionary,
	313	we write the input word as a new word,
	314	and update the dictionary with this new word
	315	*/
	316
	317	subs byte_count, byte_count, #4
	318	ble L_budgetExhausted // o.w., return -1 to signal this page is not compressable
	319	str edx, [next_full_patt], #4 // *next_full_patt++ = input_word;
	320	mov eax, #2
	321	str edx, [dict_location] // *dict_location = input_word
	322	strb eax, [next_tag], #1 // *next_tag++ = 2 for miss
	323	subs remaining, remaining, #1 // remaining input words
	324	bgt L_scan_loop // if bit_count>0, go on the scan/tag pass,
	325	b CHECKPOINT
	326
	327	L_done_search:
	328
	329	// SET_QPOS_AREA_START(dest_buf,next_full_patt);
	330	sub eax, next_full_patt, dest_buf // next_full_patt - dest_buf
	331	lsr eax, eax, #2 // offset in 4-bytes
	332	str eax, [dest_buf] // dest_buf[0] = next_full_patt - dest_buf
	333
	334
	335	/* -------------------------- packing 1024 tags into 256 bytes ----------------------------------------*/
	336	// boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
	337
	338	add rdi, dest_buf, #12 // dest_buf
	339	mov eax, scratch // &tempTagsArray[0]
	340	sub edx, next_tag, scratch // this should be 1024
	341
	342	vld1.64 {q0,q1}, [eax,:128]!
	343	subs edx, edx, #32 // pre-decrement by 32
	344	L_pack_2bits:
	345	subs edx, edx, #32
	346	vshl.i64 d1, d1, #4
	347	vshl.i64 d3, d3, #4
	348	vorr d0, d0, d1
	349	vorr d2, d2, d3
	350	vshr.u64 d1, d0, #30
	351	vshr.u64 d3, d2, #30
	352	vorr d0, d0, d1
	353	vorr d2, d2, d3
	354	vzip.32 d0, d2
	355	vst1.64 {d0}, [rdi]!
	356	vld1.64 {q0,q1}, [eax,:128]!
	357	bgt L_pack_2bits
	358	vshl.i64 d1, d1, #4
	359	vshl.i64 d3, d3, #4
	360	vorr d0, d0, d1
	361	vorr d2, d2, d3
	362	vshr.u64 d1, d0, #30
	363	vshr.u64 d3, d2, #30
	364	vorr d0, d0, d1
	365	vorr d2, d2, d3
	366	vzip.32 d0, d2
	367	vst1.64 {d0}, [rdi]
	368
	369
	370	/* --------------------------------- packing 4-bits dict indices into dest_buf ---------------------------------- */
	371
	372	/* 1st, round up number of 4-bits dict_indices to a multiple of 8 and fill in 0 if needed */
	373	add ecx, scratch, #QPosArray_offset // tempQPosArray
	374	sub eax, next_qp, ecx // eax = num_bytes_to_pack = next_qp - (char *) tempQPosArray;
	375	add eax, eax, #7 // num_bytes_to_pack+7
	376	lsr eax, eax, #3 // num_packed_words = (num_bytes_to_pack + 7) >> 3
	377	subs byte_count, byte_count, eax, lsl #2 // byte_count -= 4 * packed_words
	378	blt L_budgetExhausted // o.w., return -1 to signal this page is not compressable
	379	add ecx, ecx, eax, lsl #3 // endQPosArray = tempQPosArray + 2*num_source_words
	380	cmp ecx, next_qp // endQPosArray vs next_qp
	381	bls L16 // if (next_qp >= endQPosArray) skip the following zero paddings
	382	sub eax, ecx, next_qp
	383	mov edx, #0
	384	tst eax, #4
	385	beq 1f
	386	str edx, [next_qp], #4
	387	1: tst eax, #2
	388	beq 1f
	389	strh edx, [next_qp], #2
	390	1: tst eax, #1
	391	beq 1f
	392	strb edx, [next_qp], #1
	393	1:
	394	L16:
	395	add edx, scratch, #QPosArray_offset // tempQPosArray
	396	mov rdi, next_full_patt // next_full_patt
	397	cmp ecx, edx // endQPosArray vs tempQPosArray
	398	ldr eax, [dest_buf]
	399	bls L20 // if (endQPosArray <= tempQPosArray) skip the following
	400
	401	/* packing 4-bits dict indices into dest_buf */
	402	L_pack_4bits:
	403	vld1.64 {d0}, [edx,:64]! // src_next[1]:src_next[0]
	404	vshr.u64 d1, d0, #28 // (src_next[1] << 4)
	405	vorr d0, d0, d1 // src_next[0] \| (src_next[1] << 4)
	406	cmp ecx, edx // source_end vs src_next
	407	vstr s0, [rdi]
	408	add rdi, rdi, #4
	409	bhi L_pack_4bits // while (src_next < source_end) repeat the loop
	410
	411	/* --------------------------- packing 3 10-bits low bits into a 32-bit word in dest_buf[] ----------------------------------------- */
	412	// SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
	413	sub eax, rdi, dest_buf // boundary_tmp - dest_buf
	414	lsr eax, eax, #2 // boundary_tmp - dest_buf in words
	415	L20:
	416	str eax, [dest_buf,#4] // dest_buf[1] = boundary_tmp - dest_buf
	417
	418	add ecx, scratch, #LowBitsArray_offset // tempLowBitsArray
	419	sub edx, next_low_bits, ecx // next_low_bits - tempLowBitsArray (in bytes)
	420	lsr edx, edx, #1 // num_tenbits_to_pack (in half-words)
	421	subs edx, edx, #3 // pre-decrement num_tenbits_to_pack by 3
	422	blt 1f // if num_tenbits_to_pack < 3, skip the following loop
	423	0:
	424	subs byte_count, byte_count, #4 // byte_count -= 4
	425	ble L_budgetExhausted // o.w., return -1 to signal this page is not compressable
	426	ldr r4,[ecx, #2] // w2:6bits:w1
	427	ldrh r0,[ecx], #6 // w0
	428	uxth r5, r4, ror #16 // w2
	429	uxth r4, r4 // w1
	430	orr r0, r0, r4, lsl #10 // w1:w0
	431	subs edx, edx, #3 // num_tenbits_to_pack-=3
	432	orr r0, r0, r5, lsl #20 // w2:w1:w0
	433	str r0, [rdi], #4 // pack w0,w1,w2 into 1 dest_buf word
	434	bge 0b // if no less than 3 elements, back to loop head
	435
	436	1: adds edx, edx, #3 // post-increment num_tenbits_to_pack by 3
	437	beq 3f // if num_tenbits_to_pack is a multiple of 3, skip the following
	438	subs byte_count, byte_count, #4 // byte_count -= 4
	439	ble L_budgetExhausted // o.w., return -1 to signal this page is not compressable
	440	ldrh eax,[ecx] // w0
	441	subs edx, edx, #1 // num_tenbits_to_pack--
	442	beq 2f //
	443	ldrh edx, [ecx, #2] // w1
	444	orr eax, eax, edx, lsl #10 // w0 \| (w1<<10)
	445
	446	2: str eax, [rdi], #4 // write the final dest_buf word
	447
	448	3: sub eax, rdi, dest_buf // boundary_tmp - dest_buf
	449	lsr eax, eax, #2 // boundary_tmp - dest_buf in terms of words
	450	str eax, [dest_buf, #8] // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
	451	lsl r0, eax, #2 // boundary_tmp - dest_buf in terms of bytes
	452
	453	L_done:
	454	// restore registers and return
	455
	456	add sp, sp, #(64+24) // skip memory for temps + dictionary
	457	#if KERNEL
	458	vld1.64 {q0,q1}, [sp]!
	459	#endif
	460	pop {r4-r6,r8-r11}
	461	pop {r7,pc}
	462
	463	.align 4
	464	L_budgetExhausted:
	465	mov r0, #-1
	466	b L_done
	467
	468
	469	.align 4,0x90
	470	L_RECORD_EXACT:
	471	/*
	472	we have an exact match of the input word to its corresponding dictionary word
	473	write tag/dict_index to the temorary buffers
	474	*/
	475	sub edx, dict_location, dictionary // dict_location - dictionary
	476	mov eax, #3
	477	lsr edx, edx, #2 // divide by 4 for word offset
	478	strb eax, [next_tag], #1 // *next_tag++ = 3 for exact
	479	strb edx, [next_qp], #1 // *next_qp = word offset (4-bit)
	480	subs remaining, remaining, #1 // remaining input words
	481	bgt L_scan_loop // if remaining>0, go on the scan/tag pass,
	482	b CHECKPOINT // if remaining = 0, break
	483
	484	.align 4,0x90
	485	L_RECORD_PARTIAL:
	486	/*
	487	we have a partial (high 22-bits) match of the input word to its corresponding dictionary word
	488	write tag/dict_index/low 10 bits to the temorary buffers
	489	*/
	490	sub eax, dict_location, dictionary // dict_location - dictionary
	491	str edx, [dict_location] // *dict_location = input_word;
	492	lsr eax, eax, #2 // offset in 32-bit word
	493	lsl edx, edx, #22
	494	strb eax, [next_qp], #1 // update *next_qp++
	495	mov eax, #1
	496	lsr edx, edx, #22 // lower 10 bits
	497	strb eax, [next_tag], #1 // *next_tag++ = 1 for partial matched
	498	strh edx, [next_low_bits], #2 // save next_low_bits++
	499	subs remaining, remaining, #1 // remaining input words
	500	bgt L_scan_loop // if remaining>0, go on the scan/tag pass,
	501
	502	CHECKPOINT:
	503	ldr eax, mode // load the mode
	504	cmp eax, #EARLYCHECK
	505	beq L_check_compression_ratio // early abort check
	506
	507	L_check_zero_page:
	508
	509	ldr eax, start_next_full_patt // check if any dictionary misses in page
	510	cmp eax, next_full_patt
	511	bne L_check_single_value_page
	512
	513	add eax, scratch, #QPosArray_offset // get start_next_qp
	514	cmp eax, next_qp // check if any partial or exact dictionary matches
	515
	516	moveq r0, #SV_RETURN // Magic return value
	517	beq L_done
	518
	519	L_check_single_value_page:
	520
	521	ldr eax, start_next_full_patt // get # dictionary misses
	522	sub eax, next_full_patt, eax
	523	lsr eax, eax, #2
	524
	525	add R11, scratch, #QPosArray_offset // get start_next_qp
	526	sub R11, next_qp, R11 // get # dictionary hits (exact + partial)
	527
	528	ldr R13, start_next_low_bits
	529	sub R13, next_low_bits, R13 // get # dictionary partial hits
	530	lsrs R13, R13, #1
	531
	532	// Single value page if one of the follwoing is true:
	533	// partial == 0 AND hits == 1023 AND miss == 1 AND tag[0] == 2 (i.e. miss)
	534	// partial == 1 AND hits == 1024 AND tag[0] == 1 (i.e. partial)
	535	//
	536	bne 1f // were there 0 partial hits?
	537
	538	mov edx, #1023
	539	cmp R11, edx // were there 1023 dictionary hits
	540	bne 1f
	541
	542	cmp eax, #1 // was there exacly 1 dictionary miss?
	543	bne 1f
	544
	545	ldrb edx, [tempTagsArray] // read the very 1st tag
	546	cmp edx, #2 // was the very 1st tag a miss?
	547	beq L_is_single_value_page
	548
	549	1:
	550	cmp R13, #1 // was there 1 partial hit?
	551	bne L_check_mostly_zero
	552
	553	mov edx, #1024
	554	cmp R11, edx // were there 1024 dictionary hits
	555	bne L_check_mostly_zero
	556
	557	ldrb edx, [tempTagsArray] // read the very 1st tag
	558	cmp edx, #1 // was the very 1st tag a partial?
	559	bne L_is_single_value_page
	560
	561	L_is_single_value_page:
	562
	563	moveq r0, #SV_RETURN // Magic return value
	564	beq L_done
	565
	566	L_check_mostly_zero:
	567	// how much space will the sparse packer take?
	568	add eax, eax, R11 // eax += (next_qp - start_next_qp)
	569	mov edx, #6
	570	mov R11, #4
	571	mla R11, eax, edx, R11 // R11 = eax * 6 (i.e. 4 byte word + 2 byte offset) + 4 byte for header
	572
	573	ldr eax, start_next_low_bits
	574	sub eax, next_low_bits, eax // get bytes consumed by lower-10 bits
	575	mov edx, #1365
	576	mul eax, eax, edx
	577
	578	ldr edx, start_next_full_patt
	579	sub edx, next_full_patt, edx // get bytes consumed by dictionary misses
	580	add eax, edx, eax, lsr #11 // eax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
	581
	582	add edx, scratch, #QPosArray_offset // get start_next_qp
	583	sub edx, next_qp, edx
	584	add eax, eax, edx, lsr #1 // eax += (next_qp - start_next_qp)/2
	585	mov edx, #(12+256)
	586	add eax, eax, edx // rax += bytes taken by the header + tags
	587
	588	cmp eax, R11 // is the default packer the better option?
	589	blt L_done_search
	590
	591	ldr edx, byte_budget
	592	cmp R11, edx // can the sparse packer fit into the given budget?
	593	bgt L_budgetExhausted
	594
	595	L_sparse_packer:
	596
	597	mov edx, #MZV_MAGIC
	598	str edx, [dest_buf], #4 // header to indicate a sparse packer
	599
	600	ldr R13, start_next_input_word // get the starting address of src
	601	mov edx, #0
	602	mov ecx, #4096
	603
	604	1:
	605	ldm R13!, {r2, r3, r5, r6, r7, r8, r9, r10}
	606
	607	teq r2, #0
	608	teqeq r3, #0
	609	teqeq r5, #0
	610	teqeq r6, #0
	611	teqeq r7, #0
	612	teqeq r8, #0
	613	teqeq r9, #0
	614	teqeq r10, #0
	615
	616	bne 2f
	617	subs ecx, ecx, #32
	618	add edx, edx, #32 // 16 more bytes have been processed
	619	bne 1b
	620	mov r0, R11 // store the size of the compressed stream
	621	b L_done
	622
	623	2:
	624	teq r2, #0
	625	strne r2, [dest_buf], #4 // store the non-0 word in the dest buffer
	626	strhne edx, [dest_buf], #2 // store the byte index
	627	add edx, edx, 4
	628
	629	teq r3, #0
	630	strne r3, [dest_buf], #4 // store the non-0 word in the dest buffer
	631	strhne edx, [dest_buf], #2 // store the byte index
	632	add edx, edx, 4
	633
	634	teq r5, #0
	635	strne r5, [dest_buf], #4 // store the non-0 word in the dest buffer
	636	strhne edx, [dest_buf], #2 // store the byte index
	637	add edx, edx, 4
	638
	639	teq r6, #0
	640	strne r6, [dest_buf], #4 // store the non-0 word in the dest buffer
	641	strhne edx, [dest_buf], #2 // store the byte index
	642	add edx, edx, 4
	643
	644	teq r7, #0
	645	strne r7, [dest_buf], #4 // store the non-0 word in the dest buffer
	646	strhne edx, [dest_buf], #2 // store the byte index
	647	add edx, edx, 4
	648
	649	teq r8, #0
	650	strne r8, [dest_buf], #4 // store the non-0 word in the dest buffer
	651	strhne edx, [dest_buf], #2 // store the byte index
	652	add edx, edx, 4
	653
	654	teq r9, #0
	655	strne r9, [dest_buf], #4 // store the non-0 word in the dest buffer
	656	strhne edx, [dest_buf], #2 // store the byte index
	657	add edx, edx, 4
	658
	659	teq r10, #0
	660	strne r10, [dest_buf], #4 // store the non-0 word in the dest buffer
	661	strhne edx, [dest_buf], #2 // store the byte index
	662	add edx, edx, 4
	663
	664	subs ecx, ecx, #32
	665	bne 1b
	666	mov r0, R11 // store the size of the compressed stream
	667	b L_done
	668
	669	L_check_compression_ratio:
	670
	671	mov eax, #NORMAL
	672	str eax, mode
	673	mov remaining, #(1024 - CHKPT_WORDS) // remaining input words to process
	674	cmp remaining, #0
	675	beq CHECKPOINT // if there are no remaining words to process
	676
	677	ldr eax, start_next_low_bits
	678	sub eax, next_low_bits, eax // get bytes consumed by lower-10 bits
	679	mov edx, #1365
	680	mul eax, eax, edx
	681
	682	ldr edx, start_next_full_patt
	683	sub edx, next_full_patt, edx // get bytes consumed by dictionary misses
	684	add eax, edx, eax, lsr #11 // eax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
	685
	686	add edx, scratch, #QPosArray_offset // get start_next_qp
	687	sub edx, next_qp, edx
	688	add eax, eax, edx, lsr #1 // eax += (next_qp - start_next_qp)/2
	689	mov edx, #(CHKPT_SHRUNK_BYTES - CHKPT_TAG_BYTES)
	690	subs eax, eax, edx // eax += CHKPT_TAG_BYTES; eax -= CHKPT_SHRUNK_BYTES
	691	bgt L_budgetExhausted // if eax is > 0, we need to early abort
	692	b L_scan_loop // we are done
	693
	694
	695	#if defined(KERNEL) && !SLIDABLE
	696	.align 2
	697	L_table:
	698	.long _hashLookupTable_new
	699	#else
	700	.align 2
	701	L_table:
	702	.long L_Tab$non_lazy_ptr-(L_table0+8)
	703
	704	.section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
	705	.align 2
	706	L_Tab$non_lazy_ptr:
	707	.indirect_symbol _hashLookupTable_new
	708	.long 0
	709	#endif
	710