]> git.saurik.com Git - apple/xnu.git/blobdiff - libkern/zlib/intel/inffastS.s
xnu-1699.22.73.tar.gz
[apple/xnu.git] / libkern / zlib / intel / inffastS.s
diff --git a/libkern/zlib/intel/inffastS.s b/libkern/zlib/intel/inffastS.s
new file mode 100644 (file)
index 0000000..4252121
--- /dev/null
@@ -0,0 +1,1179 @@
+#if (defined __i386__)
+
+/* this assembly was first compiled from inffast.c (assuming POSTINC is defined, i.e. OFF=0) and then hand-optimized */
+
+       .cstring                                // NUL-terminated error messages used by the error exits
+LC0:
+       .ascii "invalid distance too far back\0"        // NOTE(review): presumably used by Linvalid_distance_too_far_back - confirm
+LC1:
+       .ascii "invalid distance code\0"                // address is stored at 24(%ebx) by Linvalid_distance_code
+LC2:
+       .ascii "invalid literal/length code\0"          // address is stored at 24(%ebx) by invalid_literal_length_code
+       .text
+       .align 4,0x90
+
+
+#ifdef  INFLATE_STRICT
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+#endif
+.globl _inflate_fast
+_inflate_fast:
+       // entry: once the frame pointer is set up, 8(%ebp) = strm and 12(%ebp) = start
+       // set up ebp to refer to arguments strm and start
+       pushl   %ebp
+       movl    %esp, %ebp
+
+       // save edi/esi/ebx on the stack (12 bytes at -4/-8/-12 off ebp); they are popped in reverse order before ret
+       pushl   %edi
+       pushl   %esi
+       pushl   %ebx
+
+       // reserve 92 bytes: 80 bytes of locals at -16(%ebp)..-92(%ebp) (see the defines below) + 12 bytes of padding intended to align %esp to a 16-byte boundary
+       subl    $92, %esp
+       movl    8(%ebp), %ebx                                   // ebx = strm (1st argument), kept in ebx for the whole routine
+
+       /* definitions to help code readability: bits and strm are registers, everything else is a stack slot */
+
+       #define bits    %edi
+       #define strm    %ebx
+       #define state   28(strm)                // state = (struct inflate_state FAR *)strm->state;     
+       #define in              -84(%ebp)               // in = strm->next_in - OFF; OFF=0
+       #define last    -80(%ebp)               // last = in + (strm->avail_in - 5);
+       #define out             -28(%ebp)               // out = strm->next_out - OFF;
+       #define beg             -76(%ebp)               // beg = out - (start - strm->avail_out);
+       #define end             -72(%ebp)               // end = out + (strm->avail_out - 257);
+       #define wsize   -68(%ebp)               // wsize = state->wsize;
+       #define whave   -64(%ebp)               // whave = state->whave;
+       #define write   -60(%ebp)               // write = state->write;
+       #define window  -56(%ebp)               // window = state->window;
+       #define hold    -52(%ebp)               // hold = state->hold;
+       #define lcode   -48(%ebp)               // lcode = state->lencode;
+       #define dcode   -44(%ebp)               // dcode = state->distcode;
+       #define lmask   -40(%ebp)               // lmask = (1U << state->lenbits) - 1; 
+       #define dmask   -36(%ebp)               // dmask = (1U << state->distbits) - 1; 
+       #define len             -32(%ebp)               // len = this.val plus extra length bits (number of bytes to copy)
+       #define dmax    -20(%ebp)               // dmax = state->dmax; only loaded and tested under INFLATE_STRICT
+       #define dist    -16(%ebp)               // dist = this.val plus extra distance bits
+       #define write_wsize     -24(%ebp)       // write+wsize
+       #define write_1         -88(%ebp)       // write-1
+       #define op              -92(%ebp)               // op
+
+       movl    (strm), %eax                    // strm->next_in
+       movl    %eax, in                                // in = strm->next_in - OFF; OFF=0
+
+       subl    $5, %eax                                // in - 5;
+       movl    4(strm), %ecx                   // strm->avail_in
+       addl    %ecx, %eax                              // in + (strm->avail_in - 5);
+       movl    %eax, last                              // last = in + (strm->avail_in - 5);
+
+       movl    12(strm), %esi                  // strm->next_out
+       movl    %esi, out                               // out = strm->next_out - OFF;
+
+       movl    16(strm), %ecx                  // strm->avail_out
+       movl    %esi, %eax                              // out
+       subl    12(%ebp), %eax                  // out - start (start is the 2nd argument)
+       addl    %ecx, %eax                              // out - (start - strm->avail_out);
+       movl    %eax, beg                               // beg = out - (start - strm->avail_out);
+
+       leal    -257(%esi,%ecx), %ecx   // out + (strm->avail_out - 257);
+       movl    %ecx, end                               // end = out + (strm->avail_out - 257);
+
+       movl    state, %edx                             // edx = strm->state, base for all state-> loads below
+
+#ifdef INFLATE_STRICT
+       movl    20(%edx), %ecx                  // state->dmax
+       movl    %ecx, dmax                              // dmax = state->dmax;
+#endif
+
+       movl    40(%edx), %ecx                  // state->wsize
+       movl    %ecx, wsize                             // wsize = state->wsize;
+
+       movl    44(%edx), %ecx                  // state->whave
+       movl    %ecx, whave                             // whave = state->whave;
+
+       movl    48(%edx), %esi                  // state->write
+       movl    %esi, write                             // write = state->write;
+
+       movl    52(%edx), %eax                  // state->window
+       movl    %eax, window                    // window = state->window;
+
+
+       movl    56(%edx), %ecx                  // state->hold
+       movl    %ecx, hold                              // hold = state->hold
+
+       movl    60(%edx), bits                  // bits = state->bits;
+
+       movl    76(%edx), %esi                  // state->lencode
+       movl    %esi, lcode                             // lcode = state->lencode;
+
+       movl    80(%edx), %eax                  // state->distcode
+       movl    %eax, dcode                             // dcode = state->distcode;
+
+       movl    84(%edx), %ecx                  // state->lenbits
+       movl    $1, %eax                                // 1 (eax is reused for the dmask shift below)
+       movl    %eax, %esi                              // a copy of 1
+       sall    %cl, %esi                               // 1 << state->lenbits
+       decl    %esi                                    // (1U << state->lenbits) - 1;
+       movl    %esi, lmask                             // lmask = (1U << state->lenbits) - 1;
+
+       movl    88(%edx), %ecx                  // state->distbits
+       sall    %cl, %eax                               // 1 << state->distbits
+       decl    %eax                                    // (1U << state->distbits) - 1;
+       movl    %eax, dmask                             // dmask = (1U << state->distbits) - 1;
+
+
+       // these 2 values might be used often, so they are precomputed and saved on the stack
+       movl    write, %eax
+       addl    wsize, %eax
+       movl    %eax, write_wsize               // write+wsize
+
+       movl    write, %edx
+       decl    %edx
+       movl    %edx, write_1                   // write-1
+
+
+L_do_while_loop:                                               // do {
+       // when fewer than 15 bits are buffered, add 16 more input bits to hold before the length lookup
+       cmpl    $15, bits
+       jae             bits_ge_15                                      //              if (bits < 15) {
+#if 0
+       leal    8(bits), %esi                           // esi = bits+8
+       movl    in, %eax                                        // eax = in
+       movzbl  (%eax), %edx                            // edx = 1st *in (in itself is advanced by 2 at the end)
+       movl    bits, %ecx                                      // cl = bits
+       sall    %cl, %edx                                       // 1st *in << bits
+       addl    hold, %edx                                      // hold += 1st *in << bits
+       movzbl  1(%eax), %eax                           // 2nd *in
+       movl    %esi, %ecx                                      // cl = bits+8
+       sall    %cl, %eax                                       // 2nd *in << (bits+8)
+       addl    %eax, %edx                                      // hold += 2nd *in << (bits+8) 
+       movl    %edx, hold                                      // update hold
+       addl    $2, in                                          // in += 2
+       addl    $16, bits                                       // bits += 16;
+#else
+       /* according to the original author simulation, this single 16-bit load performs better than the byte-wise variant above,
+               possibly because the memory access is more often aligned here */
+       movl    in, %ecx                                        //                      unsigned short *inp = (unsigned short *) (in+OFF);
+       movzwl  (%ecx), %eax                            //                      *((unsigned short *) in);
+       movl    bits, %ecx                                      //                      bits
+       sall    %cl, %eax                                       //                      *((unsigned short *) in) << bits
+       addl    %eax, hold                                      //                      hold += (unsigned long) *((unsigned short *) in) << bits;
+       addl    $2, in                                          //                      in += 2;
+       addl    $16, bits                                       //                      bits += 16;
+#endif
+
+bits_ge_15:                                                            //              }       /* bits < 15 */
+
+       movl    hold, %eax                                      //              hold
+       andl    lmask, %eax                                     //              hold & lmask;
+       movl    lcode, %esi                                     //              lcode[] : 4-byte aligned
+       movl    (%esi,%eax,4), %eax                     //              this = lcode[hold&lmask];
+       jmp             dolen
+       .align 4,0x90
+op_nonzero:                                                            // entered from dolen with al = this.op (nonzero) and esi = this.val
+       movzbl  %al, %ecx                                       // a copy of op to cl
+       testb   $16, %cl                                        // if op&16
+       jne             Llength_base                            //              branch to length_base
+
+       testb   $64, %cl                                        // elif op&64
+       jne             length_2nd_level_else           //              branch to 2nd level length code else conditions
+
+       // 2nd level length code
+
+       movl    $1, %eax
+       sall    %cl, %eax                                       // 1 << op
+       decl    %eax                                            // ((1<<op) - 1)
+       andl    hold, %eax                                      // hold & ((1U << op) - 1)
+       movzwl  %si, %ecx                                       // this.val
+       addl    %ecx, %eax                                      // this.val + (hold & ((1U << op) - 1))
+
+       movl    lcode, %ecx                                     // lcode[] : 4-byte aligned
+       movl    (%ecx,%eax,4), %eax                     // this = lcode[this.val + (hold & ((1U << op) - 1))];
+                                                                               // goto dolen (falls through; the compiler rearranged the order of code)
+dolen:
+       movl    %eax, %esi                                      // make a copy of this (val 16-bit, bits 8-bit, op 8-bit)
+       shrl    $16, %esi                                       // %esi = this.val;
+       movzbl  %ah, %ecx                                       // op = (unsigned)(this.bits); 
+       shrl    %cl, hold                                       // hold >>= op; 
+       subl    %ecx, bits                                      // bits -= op;
+       testb   %al, %al                                        // op = (unsigned)(this.op);
+       jne             op_nonzero                                      // if op!=0, branch to op_nonzero 
+
+       movl    %esi, %ecx                                      // this.val; op == 0 means a literal byte
+       movl    out, %eax                                       // out
+       movb    %cl, (%eax)                                     // PUP(out) = (unsigned char)(this.val);
+       incl    %eax                                            // out++;
+       movl    %eax, out                                       // save out
+
+L_tst_do_while_loop_end:                               // } while (in < last && out < end);
+       movl    last, %eax                                      // last
+       cmpl    %eax, in                                        // in vs last
+       jae             return_unused_bytes             // branch to return_unused_bytes if in >= last
+       movl    end, %edx                                       // end
+       cmpl    %edx, out                                       // out vs end
+       jb              L_do_while_loop                         // branch to do loop if out < end
+
+return_unused_bytes:
+       // give back the whole bytes still buffered in hold, then write in/out/avail counts/hold/bits back and return
+       movl    bits, %eax                                      // bits
+       shrl    $3, %eax                                        // len = bits >> 3
+       movl    in, %edx                                        // in
+       subl    %eax, %edx                                      // in -= len
+       sall    $3, %eax                                        // len << 3
+       movl    bits, %ecx                                      // bits
+       subl    %eax, %ecx                                      // bits -= len << 3 (ecx holds the new bits until it is stored below)
+
+       movl    %edx, (strm)                            // strm->next_in = in + OFF;
+       movl    out, %eax
+       movl    %eax, 12(strm)                          // strm->next_out = out + OFF;
+
+       cmpl    %edx, last                                      // last vs in
+       jbe             L67                                                     // if (last <= in) branch to L67 and return to L69
+       movl    last, %eax                                      // last
+       addl    $5, %eax                                        // 5 + last
+       subl    %edx, %eax                                      // 5 + last - in        
+L69:
+       movl    %eax, 4(strm)                           // update strm->avail_in
+
+       movl    end, %eax
+       cmpl    %eax, out                                       // out vs end
+       jae             L70                                                     // if (out>=end) branch to L70, and return to L72
+       addl    $257, %eax                                      // 257 + end
+       subl    out, %eax                                       // 257 + end - out
+L72:
+       movl    %eax, 16(strm)                          // update strm->avail_out
+
+       movl    $1, %eax
+       sall    %cl, %eax                                       // 1 << bits (cl = the reduced bits computed above)
+       decl    %eax                                            // (1 << bits) -1
+       andl    hold, %eax                                      // hold &= (1U << bits) - 1;
+       movl    state, %esi
+       movl    %eax, 56(%esi)                          // state->hold = hold;
+       movl    %ecx, 60(%esi)                          // state->bits = bits;
+
+       addl    $92, %esp                                       // release the 92 bytes reserved for locals in the prologue
+
+       // restore saved registers (reverse of the edi/esi/ebx push order) and return
+       popl    %ebx
+       popl    %esi
+       popl    %edi
+       leave
+       ret
+
+       // this code segment is branched in from op_nonzero (op&16 set), with op in cl and this.val in esi
+Llength_base:
+       movzwl  %si, %esi                       // this instruction might not be needed, pad here to give better performance
+       movl    %esi, len                       // len = (unsigned)(this.val);
+       movl    %ecx, %esi                      // esi = op (ecx still holds a copy)
+       andl    $15, %esi                       // op&=15; number of extra bits
+       je              Lop_is_zero                     // if (op) {
+       cmpl    bits, %esi                      //              op vs bits
+       jbe             Lop_be_bits                     //              if (bits < op) {
+       movl    in, %edx                        //                      in
+       movzbl  (%edx), %eax            //                      *in
+       movl    bits, %ecx                      //                      bits
+       sall    %cl, %eax                       //                      *in << bits
+       addl    %eax, hold                      //                      hold += (unsigned long)(PUP(in)) << bits;
+       incl    %edx                            //                      in++
+       movl    %edx, in                        //                      update in
+       addl    $8, bits                        //                      bits += 8
+Lop_be_bits:                                   //              }
+       movl    $1, %eax                        //              1
+       movl    %esi, %ecx                      //              op
+       sall    %cl, %eax                       //              1 << op
+       decl    %eax                            //              (1<<op)-1       
+       andl    hold, %eax                      //              hold & ((1U << op) - 1)
+       addl    %eax, len                       //              len += (unsigned)hold & ((1U << op) - 1);
+       shrl    %cl, hold                       //              hold >>= op;
+       subl    %esi, bits                      //              bits -= op;
+Lop_is_zero:                                   // }
+       cmpl    $14, bits                       // if (bits < 15) {
+       jbe             bits_le_14                      //              branch to refill 16-bit into hold, and branch back to L19
+L19:                                                   // }
+       movl    hold, %eax                      // hold
+       andl    dmask, %eax                     // hold&dmask
+       movl    dcode, %esi                     // dcode[] : 4-byte aligned
+       movl    (%esi,%eax,4), %eax     // this = dcode[hold & dmask];
+       jmp             dodist
+
+Lop_16_zero:                                                           // entered from dodist when (op&16)==0, with cl = op and edx = this.val
+       testb   $64, %cl                                        // op&64
+       jne             Linvalid_distance_code          // if (op&64)!=0, branch to invalid distance code
+       movl    $1, %eax                                        // 1
+       sall    %cl, %eax                                       // (1<<op)
+       decl    %eax                                            // (1<<op)-1 
+       andl    hold, %eax                                      // (hold & ((1U << op) - 1))
+       movzwl  %dx, %edx                                       // this.val
+       addl    %edx, %eax                                      // this.val + (hold & ((1U << op) - 1))
+       movl    dcode, %edx                                     // dcode[] : 4 byte aligned
+       movl    (%edx,%eax,4), %eax                     // this = dcode[this.val + (hold & ((1U << op) - 1))];
+dodist:
+       movl    %eax, %edx                                      // this : (val 16-bit, bits 8-bit, op 8-bit)
+       shrl    $16, %edx                                       // edx = this.val
+       movzbl  %ah, %ecx                                       // op = (unsigned)(this.bits); 
+       shrl    %cl, hold                                       // hold >>= op;
+       subl    %ecx, bits                                      // bits -= op;
+       movzbl  %al, %ecx                                       // op = (unsigned)(this.op);
+       testb   $16, %cl                                        // op & 16
+       je              Lop_16_zero                                     // if (op&16)==0 goto test op&64
+
+Ldistance_base:                                                        // if (op&16) {         /* distance base */
+       andl    $15, %ecx                                       //        op &= 15; edx = dist = this.val;
+       movl    %ecx, op                                        //              save a copy of op
+       cmpl    bits, %ecx                                      //              op vs bits
+       jbe             0f                                                      //              if (bits < op) {
+       movl    in, %ecx                                        //                      in
+       movzbl  (%ecx), %eax                            //                      *in
+       movl    bits, %ecx                                      //                      bits
+       sall    %cl, %eax                                       //                      *in << bits
+       addl    %eax, hold                                      //                      hold += (unsigned long)(PUP(in)) << bits;
+       incl    in                                                      //                      in++
+       addl    $8, bits                                        //                      bits += 8
+       cmpl    bits, op                                        //                      op vs bits
+       jbe             0f                                                      //                      if (bits < op) {
+       movl    in, %esi                                        //                              in
+       movzbl  (%esi), %eax                            //                              *in
+       movl    bits, %ecx                                      //                              cl = bits
+       sall    %cl, %eax                                       //                              *in << bits
+       addl    %eax, hold                                      //                              hold += (unsigned long)(PUP(in)) << bits;
+       incl    %esi                                            //                              in++
+       movl    %esi, in                                        //                              update in
+       addl    $8, bits                                        //                              bits += 8
+0:                                                                             // }            }
+
+       movzwl  %dx, %edx                                       // dist = (unsigned)(this.val); 
+       movl    $1, %eax                                        // 1
+       movzbl  op, %ecx                                        // cl = op (byte load of the saved copy; op was masked with 15 above)
+       sall    %cl, %eax                                       // 1 << op
+       decl    %eax                                            // ((1U << op) - 1)
+       andl    hold, %eax                                      // (unsigned)hold & ((1U << op) - 1)
+       addl    %edx, %eax                                      // dist += (unsigned)hold & ((1U << op) - 1);
+
+#ifdef INFLATE_STRICT
+
+       cmpl    dmax, %eax                                              // dist vs dmax
+       ja              Linvalid_distance_too_far_back  // if (dist > dmax) break for invalid distance too far back     
+
+#endif
+
+       movl    %eax, dist                                              // save a copy of dist in stack
+       shrl    %cl, hold                                               // hold >>= op; 
+       subl    %ecx, bits                                              // bits -= op;
+
+       movl    out, %eax
+       subl    beg, %eax                                               // eax = op = out - beg
+       cmpl    %eax, dist                                              // dist vs op
+       jbe             Lcopy_direct_from_output                // if (dist <= op) branch to copy direct from output    
+
+                                                                                       // if (dist > op) {
+       movl    dist, %ecx                                              //      dist
+       subl    %eax, %ecx                                              //      ecx = op = dist - op;
+       cmpl    %ecx, whave                                             //  whave vs op
+       jb              Linvalid_distance_too_far_back  //  if (op > whave) break for error;
+
+       movl    write, %edx
+       testl   %edx, %edx
+       jne             Lwrite_non_zero                                 // if (write==0) {
+       movl    wsize, %eax                                             //              wsize
+       subl    %ecx, %eax                                              //              wsize-op
+       movl    window, %esi                                    //              from=window-OFF
+       addl    %eax, %esi                                              //              from += wsize-op
+       movl    out, %edx                                               //              out
+       cmpl    %ecx, len                                               //              len vs op
+       jbe             L38                                                             //              if (len <= op) skip: L38 copies all len bytes from esi (still pointing into the window)
+    subl    %ecx, len                                          // len -= op;
+0:                                                                                     // do {
+       movzbl  (%esi), %eax                                    //
+    movb    %al, (%edx)                                                //      
+    incl    %edx                                                       //
+    incl    %esi                                                       //      PUP(out) = PUP(from);
+    decl    %ecx                                                       //              --op;
+    jne     0b                                                         // } while (op);
+
+    movl    %edx, out                                          // update out
+    movl    %edx, %esi                                         // out 
+    subl    dist, %esi                                         // esi = from = out - dist;
+
+L38:                   /* copy len bytes from esi (from) to out; the C equivalent is shown below */
+
+                       //              while (len > 2) {
+            //            PUP(out) = PUP(from);
+            //            PUP(out) = PUP(from);
+            //            PUP(out) = PUP(from);
+            //            len -= 3;
+            //        }
+            //        if (len) {
+            //            PUP(out) = PUP(from);
+            //            if (len > 1)
+            //                PUP(out) = PUP(from);
+            //       }
+
+       movl    len, %ecx                                               // len
+       movl    out, %edx                                               // out
+       subl    $3, %ecx                                                // pre-decrement len by 3
+       jl              1f                                                              // if len < 3, branch to 1f for remaining processing
+0:                                                                                     // while (len>2) {
+       movzbl  (%esi), %eax
+       movb    %al, (%edx)                                             //              PUP(out) = PUP(from);
+       movzbl  1(%esi), %eax
+       movb    %al, 1(%edx)                                    //              PUP(out) = PUP(from);
+       movzbl  2(%esi), %eax
+       movb    %al, 2(%edx)                                    //              PUP(out) = PUP(from);
+       addl    $3, %esi                                                //              from += 3;
+       addl    $3, %edx                                                //              out += 3;
+       subl    $3, %ecx                                                //              len -= 3;
+       jge             0b                                                              // }
+       movl    %edx, out                                               // update out, in case len == 0
+1:
+       addl    $3, %ecx                                                // post-increment len by 3; ecx = remaining len (0, 1 or 2)
+       je              L_tst_do_while_loop_end                 // if (len) {
+       movzbl  (%esi), %eax                                    //
+       movb    %al, (%edx)                                             //              PUP(out) = PUP(from);
+       incl    %edx                                                    //              out++
+       movl    %edx, out                                               //              update out, in case len == 1
+       cmpl    $2, %ecx                                                //              len vs 2
+       jne             L_tst_do_while_loop_end                 //              if len==1, break
+       movzbl  1(%esi), %eax                                   //              2nd byte: esi was not advanced, edx was
+       movb    %al, (%edx)                                             //              PUP(out) = PUP(from);
+       incl    %edx                                                    //              out++
+       movl    %edx, out                                               //              update out
+       jmp             L_tst_do_while_loop_end                 //      }
+       
+       .align 4,0x90
+length_2nd_level_else:                                                 // entered from op_nonzero when op&64 is set, with op in ecx
+       andl    $32, %ecx                                               // test end-of-block
+       je              invalid_literal_length_code             // if (op&32)==0, branch for invalid literal/length code break
+       movl    state, %edx                                             // if (op&32), end-of-block is detected
+       movl    $11, (%edx)                                             // state->mode = TYPE
+       jmp             return_unused_bytes
+
+L70:                                                                           // out-of-line avail_out computation for out >= end; result is returned in %eax
+       movl    out, %edx                                               // out
+       subl    %edx, end                                               // (end-out), overwrites the end stack slot
+       movl    end, %esi                                               // %esi = (end-out) = -(out - end);
+       leal    257(%esi), %eax                                 // %eax = 257 + %esi = 257 - (out -end)
+       jmp             L72                                                             // return to update state and return
+
+L67:                                                                           // %edx = in, to return 5 - (in - last) in %eax
+       subl    %edx, last                                              // last - in, overwrites the last stack slot
+       movl    last, %edx                                              // %edx = last - in = - (in - last);
+       leal    5(%edx), %eax                                   // %eax = 5 + %edx = 5 - (in - last);
+       jmp             L69                                                             // return to update state and return
+
+bits_le_14:                                                            // entered from Lop_is_zero when bits < 15: add 16 input bits to hold, then jump back to L19
+#if 1
+       leal    8(bits), %esi                           // esi = bits+8
+       movl    in, %eax                                        // eax = in
+       movzbl  (%eax), %edx                            // edx = 1st *in (in itself is advanced by 2 below)
+       movl    bits, %ecx                                      // cl = bits
+       sall    %cl, %edx                                       // 1st *in << bits
+       addl    hold, %edx                                      // hold += 1st *in << bits
+       movzbl  1(%eax), %eax                           // 2nd *in
+       movl    %esi, %ecx                                      // cl = bits+8
+       sall    %cl, %eax                                       // 2nd *in << (bits+8)
+       addl    %eax, %edx                                      // hold += 2nd *in << (bits+8) 
+       movl    %edx, hold                                      // update hold
+       addl    $2, in                                          // in += 2
+       addl    $16, bits                                       // bits += 16;
+       jmp     L19
+#else
+       /* this 16-bit load variant does not run as fast as the byte-wise variant above, possibly because the processor
+               needs extra time to handle an unaligned short access */
+       movl    in, %edx                    //          unsigned short *inp = (unsigned short *) (in+OFF);
+    movzwl  (%edx), %eax                //          *((unsigned short *) in);
+    movl    bits, %ecx                  //          bits
+    sall    %cl, %eax                   //          *((unsigned short *) in) << bits
+    addl    %eax, hold                  //          hold += (unsigned long) *((unsigned short *) in) << bits;
+    addl    $2, %edx                    //          in += 2;
+    addl    $16, %ecx                   //          bits += 16;
+       movl    %edx, in
+       movl    %ecx, bits
+       jmp     L19
+#endif
+invalid_literal_length_code:                           // error exit reached from length_2nd_level_else when (op&32)==0
+    call    0f                                                         // push the address of local label 0 (position-independent way to obtain eip)
+0:     popl    %eax                                            // eax = run-time address of label 0
+       leal    LC2-0b(%eax), %eax                      // eax = run-time address of the LC2 message string
+       movl    %eax, 24(strm)                          // store the message pointer at offset 24 of strm (presumably strm->msg - confirm against the z_stream layout)
+       movl    state, %esi
+       movl    $27, (%esi)                                     // state->mode = 27 (presumably BAD - confirm against the inflate_mode enum)
+       jmp             return_unused_bytes
+Linvalid_distance_code:                                        // error exit reached from Lop_16_zero when (op&64)!=0
+    call    0f                                                         // push the address of local label 0 (position-independent way to obtain eip)
+0:     popl    %eax                                            // eax = run-time address of label 0
+       leal    LC1-0b(%eax), %eax                      // eax = run-time address of the LC1 message string
+       movl    %eax, 24(strm)                          // store the message pointer at offset 24 of strm (presumably strm->msg - confirm against the z_stream layout)
+       movl    state, %eax
+       movl    $27, (%eax)                                     // state->mode = 27 (presumably BAD - confirm against the inflate_mode enum)
+       jmp             return_unused_bytes
+
+#ifdef INFLATE_STRICT
+       .align  4,0x90
+       .byte   0
+       .byte   0
+       .byte   0
+       .byte   0
+       .byte   0
+       .byte   0
+       .byte   0
+       .byte   0
+       .byte   0
+#endif
+Lcopy_direct_from_output:                                      // copy len bytes from (out - dist) to out, 3 bytes per iteration plus a 0..2 byte tail
+       movl    out, %edx                                                       // out
+       subl    dist, %edx                                                      // from = out - dist
+       movl    out, %ecx                                                       // out
+       movl    len, %esi                                                       // len
+       subl    $3, %esi                                                        // pre-decrement len by 3
+0:                                                                                             // do {   (body runs before any length test: assumes len >= 3 on entry -- TODO confirm)
+       movzbl  (%edx), %eax
+       movb    %al, (%ecx)                                                     //      PUP(out) = PUP(from);
+       movzbl  1(%edx), %eax
+       movb    %al, 1(%ecx)                                            //      PUP(out) = PUP(from);
+       movzbl  2(%edx), %eax
+       movb    %al, 2(%ecx)                                            //      PUP(out) = PUP(from);
+       addl    $3, %edx                                                        //      from += 3
+       addl    $3, %ecx                                                        //      out += 3        
+       subl    $3, %esi                                                        //      len -= 3
+       jge             0b                                                                      // } while (len > 2);   (signed test on the pre-decremented count)
+       movl    %ecx, out                                                       // update out in case len == 0
+       addl    $3, %esi                                                        // post-increment len by 3, leaving the 0..2 remaining bytes
+       je              L_tst_do_while_loop_end                         // if (len) {
+       movzbl  (%edx), %eax
+       movb    %al, (%ecx)                                                     //              PUP(out) = PUP(from);
+       incl    %ecx
+       movl    %ecx, out                                                       //              out++
+       cmpl    $2, %esi                                                        //              len vs 2 (len is 1 or 2 here)
+       jne             L_tst_do_while_loop_end                         //              if (len == 2)
+       movzbl  1(%edx), %eax                                           //                      second tail byte; from (edx) was not advanced, hence offset 1
+       movb    %al, (%ecx)                                                     //                      PUP(out) = PUP(from);
+       incl    %ecx
+       movl    %ecx, out                                                       //                      out++
+       jmp             L_tst_do_while_loop_end                         // }
+
+       .align 4,0x90
+Lwrite_non_zero:                                                               // %edx = write, %ecx = op
+       movl    window, %esi                                            // from = window - OFF;
+       cmp             %ecx, %edx                                                      // write vs op, test for wrap around window or contiguous in window
+       jae             Lcontiguous_in_window                           // if (write >= op) branch to contiguous in window 
+
+Lwrap_around_window:                                                   // wrap around window (write < op); esi = window, edx = write, ecx = op
+       addl    write_wsize, %esi                                       // from += write+wsize
+       subl    %ecx, %esi                                                      // from -= op, i.e. from = window + wsize + write - op;
+       subl    %edx, %ecx                                                      // op -= write
+       cmpl    %ecx, len                                                       // len vs op
+       jbe             L38                                                                     // if (len <= op) no piecewise copy needed: branch to L38 with from in %esi
+       subl    %ecx, len                                                       // len -= op;
+       movl    out, %edx                                                       // edx = out (write is re-read from its memory slot below)
+0:                                                                                             // do {
+       movzbl  (%esi), %eax                                            //      *from
+       movb    %al, (%edx)                                                     //      *out
+       incl    %esi                                                            //      from++
+       incl    %edx                                                            //      out++   
+       decl    %ecx                                                            //      --op
+       jne             0b                                                                      // } while (op);
+
+       movl    %edx, out                                                       // save out in case we need to break to L38
+       movl    window, %esi                                            // from = window - OFF;
+       movl    len, %eax                                                       // len
+       cmpl    %eax, write                                                     // write vs len
+       jae             L38                                                                     // if (write >= len) break to L38 
+
+       movl    write, %ecx                                                     // op = write
+       subl    %ecx, len                                                       // len -= op;
+0:                                                                                             // do {
+       movzbl  (%esi), %eax                                            //      *from
+       movb    %al, (%edx)                                                     //  *out
+       incl    %esi                                                            //  from++
+       incl    %edx                                                            //      out++
+       decl    %ecx                                                            //  --op
+       jne             0b                                                                      // } while (op);
+
+       movl    %edx, %esi                                                      // from = out
+       movl    %edx, out                                                       // save a copy of out
+       subl    dist, %esi                                                      // from = out - dist;
+       jmp             L38                                                                     // remaining len bytes are copied at L38, from the output buffer
+
+Lcontiguous_in_window:                                                         // contiguous in window (write >= op): esi = window, edx = write, %ecx = op
+       subl    %ecx, %edx                                                              // write - op
+       addl    %edx, %esi                                                              // from += write - op;
+       cmpl    %ecx, len                                                               // len vs op
+       jbe             L38                                                                             // if (len <= op) no piecewise copy needed: branch to L38 with from in %esi
+       movl    out, %edx                                                               // out
+       subl    %ecx, len                                                               // len -= op;
+
+0:                                                                                                     // do {
+       movzbl  (%esi), %eax                                                    //      *from
+       movb    %al, (%edx)                                                             //      *out
+       incl    %esi                                                                    //      from++
+       incl    %edx                                                                    //      out++
+       decl    %ecx                                                                    //      op-- 
+       jne             0b                                                                              // } while (op); 
+
+       movl    %edx, out                                                               // update out
+       movl    %edx, %esi                                                              // from = out
+       subl    dist, %esi                                                              // from = out - dist;
+       jmp             L38                                                                     // remaining len bytes are copied at L38, from the output buffer
+
+Linvalid_distance_too_far_back:                                // error exit: record the LC0 message and mark the state as BAD
+    call    0f                                                  // push the address of label 0 (position-independent way to read the PC)
+0:     popl    %eax                                            // eax = run-time address of label 0
+       leal    LC0-0b(%eax), %eax                              // eax = address of "invalid distance too far back"
+       movl    %eax, 24(strm)                                  // store the message pointer at 24(strm) (presumably strm->msg -- confirm against z_stream layout)
+       movl    state, %ecx                                     // ecx = strm->state
+       movl    $27, (%ecx)                                     // state->mode = 27 (BAD)
+       jmp             return_unused_bytes                     // continue at return_unused_bytes (defined outside this section)
+
+#endif
+
+#if (defined __x86_64__)
+       .cstring
+LC0:
+       .ascii "invalid distance too far back\0"
+LC1:
+       .ascii "invalid distance code\0"
+LC2:
+       .ascii "invalid literal/length code\0"
+       .text
+       .align 4,0x90
+
+#ifdef  INFLATE_STRICT
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+#endif
+
+.globl _inflate_fast
+_inflate_fast:
+
+       // set up rbp
+       pushq   %rbp
+       movq    %rsp, %rbp
+
+       // save registers in stack
+       pushq   %r15
+       pushq   %r14
+       pushq   %r13
+       pushq   %r12
+       pushq   %rbx
+
+       #define strm            %r13
+       #define state           %rdi
+       #define in                      %r12
+       #define in_d            %r12d
+       #define out                     %r10
+       #define out_d           %r10d
+       #define write           %r15d
+       #define hold            %r9
+       #define holdd           %r9d
+       #define bits            %r8d
+       #define lcode           %r14
+       #define len                     %ebx
+       #define from            %rcx
+       #define dmax            %r11d
+
+       #define last            -104(%rbp)
+       #define beg                     -96(%rbp)
+       #define end                     -88(%rbp)
+       #define wsize           -80(%rbp)
+       #define whave           -76(%rbp)
+       #define window          -72(%rbp)
+       #define dcode           -64(%rbp)
+       #define lmask           -56(%rbp)
+       #define dmask           -112(%rbp)
+       #define wsize_write     -116(%rbp)
+       #define write_1         -128(%rbp)
+       #define dist            -44(%rbp)
+
+       // reserve stack memory for local variables 128-40=88
+       subq    $88, %rsp
+
+       movq    %rdi, strm
+       movq    56(%rdi), state                                         // state = (struct inflate_state FAR *)strm->state;     
+       movq    (strm), in                                                      // in = strm->next_in - OFF;
+       movl    8(strm), %eax                                           // strm->avail_in
+       subl    $5, %eax                                                        // (strm->avail_in - 5)
+       addq    in, %rax                                                        // in + (strm->avail_in - 5)
+       movq    %rax, last                                                      // last = in + (strm->avail_in - 5)
+       movq    24(strm), out                                           // out = strm->next_out
+       movl    32(strm), %eax                                          // strm->avail_out
+       subl    %eax, %esi                                                      // (start - strm->avail_out);
+       movq    out, %rdx                                                       // strm->next_out
+       subq    %rsi, %rdx                                                      // out - (start - strm->avail_out); 
+       movq    %rdx, beg                                                       // beg = out - (start - strm->avail_out);
+       subl    $257, %eax                                                      // (strm->avail_out - 257)
+       addq    out, %rax                                                       // out + (strm->avail_out - 257); 
+       movq    %rax, end                                                       // end = out + (strm->avail_out - 257);
+
+#ifdef INFLATE_STRICT
+       movl    20(state), dmax                                         // dmax = state->dmax;
+#endif
+
+       movl    52(state), %ecx                                         // state->wsize
+       movl    %ecx, wsize                                                     // wsize = state->wsize;
+       movl    56(state), %ebx                                         // state->whave;
+       movl    %ebx, whave                                                     // whave = state->whave;
+       movl    60(state), write                                        // write = state->write;
+       movq    64(state), %rax                                         // state->window
+       movq    %rax, window                                            // window = state->window;
+       movq    72(state), hold                                         // hold = state->hold;
+       movl    80(state), bits                                         // bits = state->bits;
+
+       movq    96(state), lcode                                        // lcode = state->lencode;
+       movq    104(state), %rdx                                        // state->distcode;
+       movq    %rdx, dcode                                                     // dcode = state->distcode;
+
+       movl    116(state), %ecx                                        // state->distbits
+       movl    $1, %eax
+       movl    %eax, %edx                                                      // 1
+       sall    %cl, %edx                                                       // (1U << state->distbits)
+       movl    112(state), %ecx                                        // state->lenbits
+       sall    %cl, %eax                                                       // (1U << state->lenbits)
+       decl    %eax                                                            // (1U << state->lenbits) - 1
+       movq    %rax, lmask                                                     // lmask = (1U << state->lenbits) - 1
+       decl    %edx                                                            // (1U << state->distbits) - 1
+       movq    %rdx, dmask                                                     // dmask = (1U << state->distbits) - 1
+
+       movl    wsize, %ecx                                                     // wsize
+       addl    write, %ecx                                                     // wsize + write
+       movl    %ecx, wsize_write                                       // wsize_write = wsize + write
+
+       leal    -1(%r15), %ebx                                          // write - 1
+       movq    %rbx, write_1                                           // write_1 = write - 1
+
+L_do_while_loop:
+       cmpl    $14, bits                                                       // bits vs 14
+       ja              0f                                                                      // if (bits < 15) {
+       movzwl  (in), %eax                                                      //              read 2 bytes from in
+       movl    bits, %ecx                                                      //              set up cl = bits
+       salq    %cl, %rax                                                       //              (*in) << bits   
+       addq    %rax, hold                                                      //              hold += (*in) << bits
+       addq    $2, in                                                          //              in += 2
+       addl    $16, bits                                                       //              bits += 16
+0:                                                                                             // }
+       movq    lmask, %rax                                                     //      lmask
+       andq    hold, %rax                                                      //      hold & lmask
+       jmp             1f
+       .align 4,0x90
+Lop_nonzero:                                                                   // this.op != 0: al = op, edx = this.val
+       movzbl  %al, %ecx                                                       // op in al and cl 
+       testb   $16, %cl                                                        // check for length base processing (op&16)
+       jne             L_length_base                                           // if (op&16) branch to length base processing  
+       testb   $64, %cl                                                        // check for 2nd level length code (op&64==0)
+       jne             L_end_of_block                                          // if (op&64)!=0, branch for end-of-block processing
+
+       /* 2nd level length code : (op&64) == 0*/
+L_2nd_level_length_code:
+       movl    $1, %eax                                                        // 1
+       sall    %cl, %eax                                                       // 1 << op
+       decl    %eax                                                            // ((1U << op) - 1)
+       andq    hold, %rax                                                      // (hold & ((1U << op) - 1))
+       movzwl  %dx, %edx                                                       // edx = this.val (zero-extended 16 bits)
+       addq    %rdx, %rax                                                      // rax = this.val + (hold & ((1U << op) - 1)), the 2nd level table index
+1:                                                                                     // rax = table index (hold & lmask when entered from the loop head)
+       movl    (lcode,%rax,4), %eax                            // this = lcode[index]; each entry is 4 bytes
+Ldolen:
+       movl    %eax, %edx                                                      // a copy of this
+       shrl    $16, %edx                                                       // edx = this.val;
+       movzbl  %ah, %ecx                                                       // op = this.bits
+       shrq    %cl, hold                                                       // hold >>= op; 
+       subl    %ecx, bits                                                      // bits -= op;
+       testb   %al, %al                                                        // op = (unsigned)(this.op);
+       jne             Lop_nonzero                                                     // if (op != 0) not a literal: decode op at Lop_nonzero
+L_literal:
+       movb    %dl, (out)                                                      // *out = this.val
+       incq    out                                                                     // out ++
+L_do_while_loop_check:
+       cmpq    last, in                                                        // in vs last
+       jae             L_return_unused_byte                            // if in >= last, break to return unused byte processing
+       cmpq    end, out                                                        // out vs end
+       jb              L_do_while_loop                                         // back to do_while_loop if out < end
+
+       /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
+
+L_return_unused_byte:
+       movl    out_d, %esi
+       jmp             L34
+
+L_length_base:                         /* al = cl = op, edx = this.val, op&16 = 16 */ 
+       movzwl  %dx, len                                                        // len = (unsigned)(this.val);
+       movl    %ecx, %edx                                                      // op
+       andl    $15, %edx                                                       // op &= 15;
+       je              1f                                                                      // if (op) {
+       cmpl    bits, %edx                                                      //              op vs bits
+       jbe             0f                                                                      //              if (bits < op) {
+       movzbl  (in), %eax                                                      //                      *in
+       movl    bits, %ecx                                                      //                      cl = bits
+       salq    %cl, %rax                                                       //                      *in << bits
+       addq    %rax, hold                                                      //                      hold += (unsigned long)(PUP(in)) << bits;
+       incq    in                                                                      //                      in++
+       addl    $8, bits                                                        //                      bits += 8
+0:                                                                                             //              }
+       movl    $1, %eax                                                        //              1
+       movl    %edx, %ecx                                                      //              cl = op
+       sall    %cl, %eax                                                       //              1 << op
+       decl    %eax                                                            //              (1 << op) - 1
+       andl    holdd, %eax                                                     //               (unsigned)hold & ((1U << op) - 1);
+       addl    %eax, len                                                       //              len += (unsigned)hold & ((1U << op) - 1);
+       shrq    %cl, hold                                                       //              hold >>= op;
+       subl    %edx, bits                                                      //              bits -= op;
+1:                                                                                             // }
+       cmpl    $14, bits                                                       // bits vs 14
+       jbe             L99                                                                     // if (bits < 15) go to loading to hold and return to L19
+L19:                                                                                           // }
+       movq    dmask, %rax                                                     // dmask
+       andq    hold, %rax                                                      // hold & dmask
+       movq    dcode, %rdx                                                     // dcode[]
+       movl    (%rdx,%rax,4), %eax                                     // this = dcode[hold & dmask];
+       jmp             L_dodist
+       .align 4,0x90
+0:                                                                                             // op&16 == 0, test (op&64)==0 for 2nd level distance code
+       testb   $64, %cl                                                        // op&64        
+       jne             L_invalid_distance_code                         // if ((op&64)==0) { /* 2nd level distance code */
+       movl    $1, %eax                                                        //      1
+       sall    %cl, %eax                                                       //  1 << op 
+       decl    %eax                                                            // (1 << op) - 1
+       andq    hold, %rax                                                      // (hold & ((1U << op) - 1))
+       movzwl  %dx, %edx                                                       // this.val     
+       addq    %rdx, %rax                                                      // this.val + (hold & ((1U << op) - 1))
+       movq    dcode, %rcx                                                     // dcode[]
+       movl    (%rcx,%rax,4), %eax                                     // this = dcode[this.val + (hold & ((1U << op) - 1))];
+L_dodist:
+       movl    %eax, %edx                                                      // this
+       shrl    $16, %edx                                                       // dist = (unsigned)(this.val);
+       movzbl  %ah, %ecx                                                       // cl = op = this.bits
+       shrq    %cl, hold                                                       // hold >>= op;
+       subl    %ecx, bits                                                      // bits -= op;
+       movzbl  %al, %ecx                                                       // op = (unsigned)(this.op);
+       testb   $16, %cl                                                        // (op & 16)    test for distance base
+       je              0b                                                                      // if (op&16) == 0, branch to check for 2nd level distance code
+
+L_distance_base:                                                               /* distance base */
+
+       movl    %ecx, %esi                                                      // op
+       andl    $15, %esi                                                       // op&=15
+       cmpl    bits, %esi                                                      // op vs bits
+       jbe             1f                                                                      // if (bits < op) {
+       movzbl  (in), %eax                                                      //              *in
+       movl    bits, %ecx                                                      //              cl = bits
+       salq    %cl, %rax                                                       //              *in << bits
+       addq    %rax, hold                                                      //              hold += (unsigned long)(PUP(in)) << bits;
+       incq    in                                                                      //              in++
+       addl    $8, bits                                                        //              bits += 8
+       cmpl    bits, %esi                                                      //              op vs bits
+       jbe             1f                                                                      //              if (bits < op) {
+       movzbl  (in), %eax                                                      //                      *in
+       movl    bits, %ecx                                                      //                      cl = bits
+       salq    %cl, %rax                                                       //                      *in << bits
+       addq    %rax, hold                                                      //                      hold += (unsigned long)(PUP(in)) << bits;
+       incq    in                                                                      //                      in++
+       addl    $8, bits                                                        //                      bits += 8
+1:                                                                                             // }    }
+
+       movzwl  %dx, %edx                                                       // dist
+       movl    $1, %eax                                                        // 1
+       movl    %esi, %ecx                                                      // cl = op
+       sall    %cl, %eax                                                       // (1 << op)
+       decl    %eax                                                            // (1 << op) - 1
+       andl    holdd, %eax                                                     // (unsigned)hold & ((1U << op) - 1)
+       addl    %edx, %eax                                                      // dist += (unsigned)hold & ((1U << op) - 1);
+       movl    %eax, dist                                                      // save a copy of dist in stack
+
+#ifdef INFLATE_STRICT
+       cmp             %eax, dmax                                                      // dmax vs dist 
+       jb              L_invalid_distance_too_far_back         // if (dmax < dist) break for invalid distance too far back
+#endif
+
+       shrq    %cl, hold                                                       // hold >>= op;
+       subl    %esi, bits                                                      // bits -= op;
+       movl    out_d, %esi                                                     // out
+       movl    out_d, %eax                                                     // out
+       subl    beg, %eax                                                       // op = out - beg
+       cmpl    %eax, dist                                                      // dist vs op,  /* see if copy from window */
+       jbe             L_copy_direct_from_output                       // if (dist <= op) branch to copy direct from output
+
+L_distance_back_in_window:                     
+
+       movl    dist, %edx                                                      // dist
+       subl    %eax, %edx                                                      // op = dist - op;      /* distance back in window */
+
+       cmpl    %edx, whave                                                     // whave vs op
+       jb              L_invalid_distance_too_far_back         // if (op > whave), break for invalid distance too far back
+
+       testl   write, write                                            // if (write!=0)
+       jne             L_wrap_around_window                            //              branch to wrap around window
+
+L_very_common_case:
+
+       movl    wsize, %eax                                                     //      wsize
+       subl    %edx, %eax                                                      //      wsize - op
+       movq    window, from                                            //      from = window - OFF;
+       addq    %rax, from                                                      //      from += wsize - op;
+
+       movl    %edx, %esi                                                      //  op
+       cmpl    %edx, len                                                       //  len vs op
+       ja              L_some_from_window                                      //  if (len > op), branch for aligned code block L_some_from_window
+L38:                                                                                   // copy the remaining len bytes from (from) to (out), 3 bytes per iteration plus a 0..2 byte tail
+       subl    $3, len                                                         // pre-decrement len by 3 so the sign of the result tells whether a full 3-byte group remains
+       jge             0f                                                                      // if len was >= 3, branch to the aligned 3-byte copy loop below
+1:     addl    $3, len                                                         // undo the bias: len is now the 0..2 leftover bytes
+       je              L_do_while_loop_check                           // if (len==0) nothing is left, break to L_do_while_loop_check
+       movzbl  (from), %eax                                            // load 1st leftover byte, *from
+       movb    %al, (out)                                                      // *out = that byte
+       incq    out                                                                     // out++ (from is not advanced, hence the 1(from) below)
+       cmpl    $2, len                                                         // was there a 2nd leftover byte ?
+       jne             L_do_while_loop_check                           // if len!=2 (i.e. len==1) break to L_do_while_loop_check
+       movzbl  1(from), %eax                                           // load 2nd leftover byte, from[1]
+       movb    %al, (out)                                                      // *out = that byte (out was already advanced by one)
+       incq    out                                                                     // out++
+       jmp             L_do_while_loop_check                           // break to L_do_while_loop_check
+
+       .align 4,0x90
+0:                                                                                             // do {
+       movzbl  (from), %eax                                            //              from[0]
+       movb    %al, (out)                                                      //              out[0] = from[0]
+       movzbl  1(from), %eax                                           //              from[1]
+       movb    %al, 1(out)                                                     //              out[1] = from[1]
+       movzbl  2(from), %eax                                           //              from[2]
+       movb    %al, 2(out)                                                     //              out[2] = from[2]
+       addq    $3, out                                                         //              out += 3
+       addq    $3, from                                                        //              from += 3
+       subl    $3, len                                                         //              len -= 3 (len stays biased by -3 inside the loop)
+       jge             0b                                                                      // } while (biased len >= 0), i.e. while at least 3 bytes remain
+       jmp             1b                                                                      // fewer than 3 bytes left: go back to the tail code at 1:
+
+       .align 4,0x90
+L_end_of_block:                                                                        // reached with op in %ecx; either end-of-block or an invalid literal/length code
+       andl    $32, %ecx                                                       // op & 32
+       jne             L101                                                            // if (op&32) branch to end-of-block break
+       leaq    LC2(%rip), from                                         // otherwise this is an error: from = address of the LC2 message string
+       movq    from, 48(strm)                                          // store the message pointer at 48(strm), the same error-message slot the other error exits write (presumably strm->msg)
+       movl    $27, (state)                                            // state->mode = BAD;
+       movl    out_d, %esi                                                     // %esi = low 32 bits of out, consumed by L34 (falls through) when computing avail_out
+
+L34:                                                                                   // common exit: return unused input bytes, write results back to strm/state, restore registers and return; expects %esi = low 32 bits of out
+       movl    bits, %eax                                                      // bits
+       shrl    $3, %eax                                                        // len = bits >> 3; (whole unused bytes still held in the bit buffer)
+       mov             %eax, %edx                                                      // zero-extended copy of len for the 64-bit pointer subtraction
+       subq    %rdx, in                                                        // in -= len
+       sall    $3, %eax                                                        // len << 3
+       movl    bits, %ecx                                                      // bits
+       subl    %eax, %ecx                                                      // %ecx = bits - (len << 3), the 0..7 bits that remain
+       movq    in, (strm)                                                      // strm->next_in = in + OFF;
+       movq    out, 24(strm)                                           // strm->next_out = out + OFF;
+       cmpq    in, last                                                        // last vs in
+       jbe             L67                                                                     // if (last <= in) branch to L67 and return to L69; both arms produce the same 32-bit value 5 + last - in
+       movl    last, %eax                                                      // last (low 32 bits)
+       addl    $5, %eax                                                        // last + 5
+       subl    in_d, %eax                                                      // 5 + last - in
+L69:
+       movl    %eax, 8(strm)                                           // update strm->avail_in
+
+       cmpq    end, out                                                        // out vs end
+       jae             L70                                                                     // if (out >= end) branch to L70 and return to L72; both arms produce the same 32-bit value 257 + end - out
+       movl    end, %eax                                                       // end (low 32 bits)
+       addl    $257, %eax                                                      // 257 + end
+       subl    %esi, %eax                                                      // 257 + end - out; (%esi must hold out here)
+L72:
+       movl    %eax, 32(strm)                                          // update strm->avail_out
+
+       movl    $1, %eax                                                        // 1
+       sall    %cl, %eax                                                       // 1 << bits (%cl holds the remaining bit count computed above)
+       decl    %eax                                                            // (1U << bits) - 1
+       andq    hold, %rax                                                      // hold &= (1U << bits) - 1; (result is left in %rax)
+       movq    %rax, 72(state)                                         // state->hold = hold;
+       movl    %ecx, 80(state)                                         // state->bits = bits;
+
+       // release the 88 bytes of stack used for local variables (the matching allocation is outside this excerpt)
+       addq    $88, %rsp
+
+       // restore the saved registers from the stack
+       popq    %rbx
+       popq    %r12
+       popq    %r13
+       popq    %r14
+       popq    %r15
+
+       // tear down the frame (leave = movq %rbp,%rsp ; popq %rbp) and return to caller
+       leave
+       ret
+
+       .align 4,0x90
+L99:                                                                                   // refill: append two input bytes to hold above the current bits, then resume at L19
+       leal    8(bits), %esi                                           //              esi = bits+8 (shift count for the 2nd byte)
+       movzbl  (in), %edx                                                      //              1st input byte, in[0]
+       movl    bits, %ecx                                                      //              cl = bits
+       salq    %cl, %rdx                                                       //              in[0] << bits
+       addq    %rdx, hold                                                      //              1st hold += (unsigned long)(PUP(in)) << bits;
+       movzbl  1(in), %eax                                                     //              2nd input byte, in[1]
+       movl    %esi, %ecx                                                      //              cl = bits + 8
+       salq    %cl, %rax                                                       //              in[1] << (bits+8)
+       addq    %rax, hold                                                      //              2nd hold += (unsigned long)(PUP(in)) << bits;
+       addq    $2, in                                                          //              in += 2
+       addl    $16, bits                                                       //              bits += 16
+       jmp             L19                                                                     // continue at L19 (outside this excerpt)
+
+L101:                                                                                  // end-of-block exit, reached from L_end_of_block when (op & 32) is set
+       movl    $11, (state)                                            // state->mode = 11 (presumably the TYPE mode of inflate, confirm against inflate.h)
+       movl    out_d, %esi                                                     // %esi = low 32 bits of out, consumed by L34 when computing avail_out
+       jmp     L34                                                                     // common exit
+       .align 4,0x90
+L70:                                                                                   // out-of-line arm of the avail_out computation in L34, taken when out >= end; expects %esi = out
+       movl    end, %eax                                                       // end
+       subl    %esi, %eax                                                      // end - out
+       addl    $257, %eax                                                      // 257 + end - out
+       jmp             L72                                                                     // back to L72 to store strm->avail_out
+       .align 4,0x90
+L67:                                                                                   // out-of-line arm of the avail_in computation in L34, taken when last <= in
+       movl    last, %eax                                                      // last
+       subl    in_d, %eax                                                      // last - in
+       addl    $5, %eax                                                        // 5 + last - in
+       jmp             L69                                                                     // back to L69 to store strm->avail_in
+
+
+       .align 4,0x90
+
+       // pad with the following 4 bytes so that the major loop is aligned to a 16-byte boundary, for better performance
+       .byte 0
+       .byte 0
+       .byte 0
+       .byte 0
+L_copy_direct_from_output:                                             // copy len bytes from (out - dist) to out, 3 bytes per iteration plus a 0..2 byte tail
+       mov             dist, %eax                                              // dist (zero-extended into %rax)
+       movq    out, %rdx                                               // out
+       subq    %rax, %rdx                                              // from = out - dist; (kept in %rdx)
+       subl    $3, len                                                 // pre-decrement len by 3; the first iteration below runs unchecked, so len >= 3 is assumed on entry -- NOTE(review): confirm at the branch site
+                                                                                       // do {
+0:     movzbl  (%rdx), %eax                                    //      from[0]
+       movb    %al, (out)                                              //      out[0] = from[0]
+       movzbl  1(%rdx), %eax                                   //      from[1]
+       movb    %al, 1(out)                                             //      out[1] = from[1]
+       movzbl  2(%rdx), %eax                                   //      from[2]
+       movb    %al, 2(out)                                             //      out[2] = from[2]
+       addq    $3, out                                                 //      out+=3
+       addq    $3, %rdx                                                //      from+=3
+       subl    $3, len                                                 //      len-=3 (len stays biased by -3 inside the loop)
+       jge             0b                                                              // } while (biased len >= 0), i.e. while at least 3 bytes remain
+1:     addl    $3, len                                                 // undo the bias: len is now the 0..2 leftover bytes
+       je              L_do_while_loop_check                   // if len==0, branch to do_while_loop_check
+
+       movzbl  (%rdx), %eax                                    // 1st leftover byte, from[0]
+       movb    %al, (out)                                              // *out = that byte
+       incq    out                                                             // out++ (from is not advanced, hence the 1(%rdx) below)
+       cmpl    $2, len                                                 // len == 2 ?
+       jne             L_do_while_loop_check                   // if len==1, branch to do_while_loop_check
+
+       movzbl  1(%rdx), %eax                                   // 2nd leftover byte, from[1]
+       movb    %al, (out)                                              // *out = that byte (out was already advanced by one)
+       incq    out                                                             // out++
+       jmp     L_do_while_loop_check                           // branch to do_while_loop_check
+
+       .align 4,0x90
+L_some_from_window:            // on entry: from = source pointer, out = destination, %esi and %edx both = op; copies op bytes, then switches the source to the output buffer
+                                                                       // do {
+       movzbl  (from), %eax                    //      load *from
+       movb    %al, (out)                              //      *out = *from
+       incq    from                                    //      from++
+       incq    out                                             //      out++
+       decl    %esi                                    //      --op (%esi is the loop counter, %edx keeps the original op)
+       jne             L_some_from_window              // } while (op);
+       subl    %edx, len                               // len -= op; (bytes just copied)
+       mov             dist, %eax                              // dist (zero-extended into %rax)
+       movq    out, from                               // out
+       subq    %rax, from                              // from = out - dist;
+       jmp             L38                                             // copy the remaining len bytes from output
+
+       .align 4,0x90
+L_wrap_around_window:                                          // on entry %edx = op; the source may span the end of the window, then its start, then the output buffer
+       cmpl    %edx, write                                     // write vs op
+       jae             L_contiguous_in_window          // if (write >= op) branch to contiguous in window
+       movl    wsize_write, %eax                       // wsize+write
+       subl    %edx, %eax                                      // wsize+write-op
+       movq    window, from                            // from = window - OFF
+       addq    %rax, from                                      // from += wsize+write-op
+       subl    write, %edx                                     // op -= write (bytes available up to the end of the window)
+       cmpl    %edx, len                                       // len vs op
+       jbe             L38                                                     // if (len<=op) everything comes from the end of the window: let L38 copy len bytes from (from)
+       subl    %edx, len                                       // len -= op;
+0:                                                                             // do {                 (some from end of window)
+       movzbl  (from), %eax                            //              load *from
+       movb    %al, (out)                                      //              *out = *from
+       incq    from                                            //              from++
+       incq    out                                                     //              out++
+       decl    %edx                                            //              op--
+       jne             0b                                                      // } while (op);
+       movq    window, from                            // from = window - OFF (continue at the start of the window)
+
+       cmpl    len, write                                      // write vs len
+       jae             L38                                                     // if (write >= len) the rest fits in the start of the window: let L38 copy len bytes from (from)
+       movl    write, %esi                                     // op = write
+       subl    write, len                                      // len -= op
+1:                                                                             // do {                 (some from start of window)
+       movzbl  (from), %eax                            //              load *from
+       movb    %al, (out)                                      //              *out = *from
+       incq    from                                            //              from++
+       incq    out                                                     //              out++
+       decl    %esi                                            //              op--
+       jne             1b                                                      // } while (op);
+       mov             dist, %eax                                      // dist (zero-extended into %rax)
+       movq    out, from                                       // out
+       subq    %rax, from                                      // from = out - dist; (rest from output)
+       jmp             L38                                                     // copy the remaining len bytes from output
+
+       .align 4,0x90
+L_contiguous_in_window:                                                // on entry %edx = op and write >= op: the window part of the source is one contiguous run
+       movl    write, %eax                                     // write
+       subl    %edx, %eax                                      // write - op
+       movq    window, from                            // from = window - OFF
+       addq    %rax, from                                      // from += write - op
+       cmpl    %edx, len                                       // len vs op
+       jbe             L38                                                     // if (len <= op) everything comes from the window: let L38 copy len bytes from (from)
+       subl    %edx, len                                       // len -= op;
+2:                                                                             // do {                 (some from window)
+       movzbl  (from), %eax                            //      load *from
+       movb    %al, (out)                                      //      *out = *from
+       incq    from                                            //      from++
+       incq    out                                                     //      out++
+       decl    %edx                                            //      op--
+       jne             2b                                                      // } while (op);
+
+       mov             dist, %eax                                      // dist (zero-extended into %rax)
+       movq    out, from                                       // out
+       subq    %rax, from                                      // from = out - dist; (rest from output)
+       jmp             L38                                                     // copy the remaining len bytes from output
+
+       .align 4,0x90
+L_invalid_distance_code:                                               // error exit: record the LC1 message, mark the state BAD and leave through L34
+       leaq    LC1(%rip), %rdx                                 // %rdx = address of the LC1 message string (%rdx is reloaded by L34 before it is used there)
+       movq    %rdx, 48(strm)                                  // store the message pointer in the error-message slot at 48(strm)
+       movl    $27, (state)                                    // state->mode = BAD
+       movl    out_d, %esi                                             // %esi = low 32 bits of out, consumed by L34 when computing avail_out
+       jmp             L34                                                             // common exit
+
+L_invalid_distance_too_far_back:
+       // Error exit for a distance that reaches too far back (per the label and the LC0 message):
+       // record the message, mark the state BAD and leave through the common exit L34.
+       // L34 computes strm->avail_out as 257 + end - %esi, so %esi must hold the low 32 bits
+       // of out on entry, exactly as every other path into L34 arranges.
+       // The message pointer is staged in scratch %rdx (as in L_invalid_distance_code) rather
+       // than in callee-saved %rbx; L34 reloads %edx before using it.
+       leaq    LC0(%rip), %rdx                         // %rdx = address of the LC0 message string
+       movq    %rdx, 48(strm)                          // error message
+       movl    $27, (state)                            // state->mode = BAD
+       movl    out_d, %esi                             // %esi = out, required by L34 for avail_out
+       jmp             L34
+
+#endif