/*
- * Copyright (c) 2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2011-2013 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
+ *
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
- *
+ *
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
+ *
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
- *
+ *
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <net/if.h>
+#include <net/dlil.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_var.h>
unsigned int lrocount = 0; /* A counter used for debugging only */
unsigned int lro_seq_outoforder = 0; /* Counter for debugging */
unsigned int lro_seq_mismatch = 0; /* Counter for debugging */
-unsigned int lro_eject_req = 0; /* Counter for tracking flow ejections */
unsigned int lro_flushes = 0; /* Counter for tracking number of flushes */
unsigned int lro_single_flushes = 0;
unsigned int lro_double_flushes = 0;
unsigned int coalesc_sz = LRO_MX_COALESCE_PKTS;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_sz, CTLFLAG_RW | CTLFLAG_LOCKED,
- &coalesc_sz, 0, "Max coalescing size");
+ &coalesc_sz, 0, "Max coalescing size");
unsigned int coalesc_time = LRO_MX_TIME_TO_BUFFER;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_time, CTLFLAG_RW | CTLFLAG_LOCKED,
- &coalesc_time, 0, "Max coalescing time");
+ &coalesc_time, 0, "Max coalescing time");
-struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS];
+struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS];
-char lro_flow_map[TCP_LRO_FLOW_MAP];
+char lro_flow_map[TCP_LRO_FLOW_MAP];
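/*
 * lro_flow_list holds the flows currently being coalesced; lro_flow_map
 * hashes a connection 4-tuple to an index into that list, with
 * TCP_LRO_FLOW_UNINIT marking an unused slot.
 */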
-static lck_attr_t *tcp_lro_mtx_attr = NULL; /* mutex attributes */
-static lck_grp_t *tcp_lro_mtx_grp = NULL; /* mutex group */
-static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL; /* mutex group attrs */
-decl_lck_mtx_data( ,tcp_lro_lock); /* Used to synchronize updates */
+static lck_attr_t *tcp_lro_mtx_attr = NULL; /* mutex attributes */
+static lck_grp_t *tcp_lro_mtx_grp = NULL; /* mutex group */
+static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL; /* mutex group attrs */
+decl_lck_mtx_data(, tcp_lro_lock); /* Used to synchronize updates */
unsigned int lro_byte_count = 0;
extern u_int32_t kipf_count;
-static void tcp_lro_timer_proc(void*, void*);
-static void lro_update_stats(struct mbuf*);
-static void lro_update_flush_stats(struct mbuf *);
-static void tcp_lro_flush_flows(void);
-static void tcp_lro_sched_timer(uint64_t);
-static void lro_proto_input(struct mbuf *);
+static void tcp_lro_timer_proc(void*, void*);
+static void lro_update_stats(struct mbuf*);
+static void lro_update_flush_stats(struct mbuf *);
+static void tcp_lro_flush_flows(void);
+static void tcp_lro_sched_timer(uint64_t);
+static void lro_proto_input(struct mbuf *);
-static struct mbuf *lro_tcp_xsum_validate(struct mbuf*, struct ipovly *,
- struct tcphdr*);
-static struct mbuf *tcp_lro_process_pkt(struct mbuf*, struct ip*, struct tcphdr*,
- int);
+static struct mbuf *lro_tcp_xsum_validate(struct mbuf*, struct ip *,
+ struct tcphdr*);
+static struct mbuf *tcp_lro_process_pkt(struct mbuf*, int);
void
tcp_lro_init(void)
{
int i;
- bzero(lro_flow_list, sizeof (struct lro_flow) * TCP_LRO_NUM_FLOWS);
+ bzero(lro_flow_list, sizeof(struct lro_flow) * TCP_LRO_NUM_FLOWS);
for (i = 0; i < TCP_LRO_FLOW_MAP; i++) {
lro_flow_map[i] = TCP_LRO_FLOW_UNINIT;
}
}
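/*
 * Look up the flow for this segment and classify it: TCP_LRO_COALESCE when
 * it is the next in-sequence piece of a tracked flow, TCP_LRO_EJECT_FLOW
 * when the flow should be flushed, TCP_LRO_COLLISION on a hash collision,
 * TCP_LRO_NAN otherwise.
 */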
static int
-tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash,
- int *flow_id )
+tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash,
+ int *flow_id )
{
struct lro_flow *flow;
tcp_seq seqnum;
unsigned int off = 0;
int payload_len = 0;
- *hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
- tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1));
+ *hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
+ tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1));
*flow_id = lro_flow_map[*hash];
if (*flow_id == TCP_LRO_FLOW_NOTFOUND) {
flow = &lro_flow_list[*flow_id];
if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
- (flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
- (flow->lr_fport == tcp_hdr->th_sport) &&
- (flow->lr_lport == tcp_hdr->th_dport)) {
+ (flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
+ (flow->lr_fport == tcp_hdr->th_sport) &&
+ (flow->lr_lport == tcp_hdr->th_dport)) {
if (flow->lr_tcphdr == NULL) {
if (ntohl(seqnum) == flow->lr_seq) {
return TCP_LRO_COALESCE;
}
if (lrodebug >= 4) {
printf("%s: seqnum = %x, lr_seq = %x\n",
- __func__, ntohl(seqnum), flow->lr_seq);
+ __func__, ntohl(seqnum), flow->lr_seq);
}
lro_seq_mismatch++;
if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) {
lro_seq_outoforder++;
- /*
+ /*
* Whenever we receive out of order packets it
- * signals loss and recovery and LRO doesn't
+ * signals loss and recovery and LRO doesn't
* let flows recover quickly. So eject.
*/
- flow->lr_flags |= LRO_EJECT_REQ;
-
+ flow->lr_flags |= LRO_EJECT_REQ;
}
return TCP_LRO_NAN;
}
if (flow->lr_flags & LRO_EJECT_REQ) {
- if (lrodebug)
+ if (lrodebug) {
printf("%s: eject. \n", __func__);
+ }
return TCP_LRO_EJECT_FLOW;
}
- if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) {
+ if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) {
if (lrodebug) {
- printf("%s: th_ack = %x flow_ack = %x \n",
- __func__, tcp_hdr->th_ack,
- flow->lr_tcphdr->th_ack);
+ printf("%s: th_ack = %x flow_ack = %x \n",
+ __func__, tcp_hdr->th_ack,
+ flow->lr_tcphdr->th_ack);
}
return TCP_LRO_EJECT_FLOW;
}
- if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) + lro_flow_list[*flow_id].lr_len)) {
+ if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) + lro_flow_list[*flow_id].lr_len)) {
return TCP_LRO_COALESCE;
} else {
/* LRO does not handle loss recovery well, eject */
return TCP_LRO_EJECT_FLOW;
}
}
- if (lrodebug) printf("tcp_lro_matching_tuple: collision \n");
+ if (lrodebug) {
+ printf("tcp_lro_matching_tuple: collision \n");
+ }
return TCP_LRO_COLLISION;
}
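/* Record the 4-tuple, hash slot and initial state for a newly tracked flow. */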
static void
-tcp_lro_init_flow(int flow_id, struct ip* ip_hdr, struct tcphdr *tcp_hdr,
- int hash, u_int32_t timestamp, int payload_len)
+tcp_lro_init_flow(int flow_id, struct ip* ip_hdr, struct tcphdr *tcp_hdr,
+ int hash, u_int32_t timestamp, int payload_len)
{
struct lro_flow *flow = NULL;
}
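/*
 * Append a segment to the flow's coalesced chain: the headers are trimmed
 * with m_adj(), lr_len/lr_seq advance by the payload length, and newer
 * timestamp option values are folded into the retained header.
 */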
static void
-tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr,
- int payload_len, int drop_hdrlen, struct tcpopt *topt,
- u_int32_t* tsval, u_int32_t* tsecr, int thflags)
+tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr,
+ int payload_len, int drop_hdrlen, struct tcpopt *topt,
+ u_int32_t* tsval, u_int32_t* tsecr, int thflags)
{
struct lro_flow *flow = NULL;
struct mbuf *last;
flow = &lro_flow_list[flow_id];
if (flow->lr_mhead) {
- if (lrodebug)
+ if (lrodebug) {
printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq,
- payload_len);
+ payload_len);
+ }
m_adj(lro_mb, drop_hdrlen);
last = flow->lr_mtail;
}
flow->lr_len += payload_len;
flow->lr_seq += payload_len;
- /*
- * This bit is re-OR'd each time a packet is added to the
+ /*
+ * This bit is re-OR'd each time a packet is added to the
* large coalesced packet.
*/
- flow->lr_mhead->m_pkthdr.aux_flags |= MAUXF_SW_LRO_PKT;
+ flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */
- if (flow->lr_mhead->m_pkthdr.lro_pktlen <
- lro_mb->m_pkthdr.lro_pktlen) {
- /*
- * For TCP Inter Arrival Jitter calculation, return max
+ if (flow->lr_mhead->m_pkthdr.lro_pktlen <
+ lro_mb->m_pkthdr.lro_pktlen) {
+ /*
+ * For TCP Inter Arrival Jitter calculation, return max
* size encountered while coalescing a stream of pkts.
*/
- flow->lr_mhead->m_pkthdr.lro_pktlen =
- lro_mb->m_pkthdr.lro_pktlen;
+ flow->lr_mhead->m_pkthdr.lro_pktlen =
+ lro_mb->m_pkthdr.lro_pktlen;
}
- /* Update the timestamp value */
+ /* Update the timestamp value */
if (topt->to_flags & TOF_TS) {
- if ((flow->lr_tsval) &&
- (TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) {
+ if ((flow->lr_tsval) &&
+ (TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) {
*(flow->lr_tsval) = htonl(topt->to_tsval);
}
if ((flow->lr_tsecr) &&
- (topt->to_tsecr != 0) &&
- (TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) {
+ (topt->to_tsecr != 0) &&
+ (TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) {
if (lrodebug >= 2) {
- printf("%s: instantaneous RTT = %d \n", __func__,
- topt->to_tsecr - ntohl(*(flow->lr_tsecr)));
+ printf("%s: instantaneous RTT = %d \n", __func__,
+ topt->to_tsecr - ntohl(*(flow->lr_tsecr)));
}
*(flow->lr_tsecr) = htonl(topt->to_tsecr);
}
} else {
if (lro_mb) {
flow->lr_mhead = flow->lr_mtail = lro_mb;
- flow->lr_mhead->m_pkthdr.aux_flags |= MAUXF_SW_LRO_PKT;
+ flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
flow->lr_tcphdr = tcphdr;
if ((topt) && (topt->to_flags & TOF_TS)) {
ASSERT(tsval != NULL);
ASSERT(tsecr != NULL);
- flow->lr_tsval = tsval;
+ flow->lr_tsval = tsval;
flow->lr_tsecr = tsecr;
- }
+ }
flow->lr_len = payload_len;
+ calculate_tcp_clock();
flow->lr_timestamp = tcp_now;
tcp_lro_sched_timer(0);
- }
+ }
flow->lr_seq = ntohl(tcphdr->th_seq) + payload_len;
}
- if (lro_mb) {
+ if (lro_mb) {
tcpstat.tcps_coalesced_pack++;
- }
+ }
return;
}
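/* Unhash and clear the whole flow entry; any coalesced chain is returned. */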
ASSERT(lro_flow_map[lro_flow_list[flow_id].lr_hash_map] == flow_id);
lro_flow_map[lro_flow_list[flow_id].lr_hash_map] = TCP_LRO_FLOW_UNINIT;
bzero(&lro_flow_list[flow_id], sizeof(struct lro_flow));
-
+
return mb;
}
{
struct mbuf *mb = NULL;
mb = lro_flow_list[flow_id].lr_mhead;
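/* Detach the coalesced chain but keep the flow entry and hash mapping alive. */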
- lro_flow_list[flow_id].lr_mhead =
- lro_flow_list[flow_id].lr_mtail = NULL;
+ lro_flow_list[flow_id].lr_mhead =
+ lro_flow_list[flow_id].lr_mtail = NULL;
lro_flow_list[flow_id].lr_tcphdr = NULL;
return mb;
}
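/*
 * Find a slot for a new flow: take the hashed slot if it is free, otherwise
 * reuse the oldest (or colliding) flow, handing any packets already
 * coalesced there back to the caller.
 */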
static struct mbuf*
-tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr,
- struct tcphdr *tcp_hdr, int payload_len,
- int drop_hdrlen, int hash, struct tcpopt *topt,
- u_int32_t *tsval, u_int32_t *tsecr)
+tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr,
+ struct tcphdr *tcp_hdr, int payload_len,
+ int drop_hdrlen, int hash, struct tcpopt *topt,
+ u_int32_t *tsval, u_int32_t *tsecr)
{
int i;
int slot_available = 0;
- int candidate_flow = 0;
+ int candidate_flow = 0;
u_int32_t oldest_timestamp;
struct mbuf *mb = NULL;
int collision = 0;
oldest_timestamp = tcp_now;
-
+
/* handle collision */
if (lro_flow_map[hash] != TCP_LRO_FLOW_UNINIT) {
if (lrodebug) {
if (lrodebug) {
if (!slot_available) {
- printf("%s: slot unavailable.\n",__func__);
+ printf("%s: slot unavailable.\n", __func__);
}
if (collision) {
- printf("%s: collision.\n",__func__);
+ printf("%s: collision.\n", __func__);
}
}
} else {
candidate_flow = i; /* this is now the flow to be used */
-
}
- tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash,
- tcp_now, payload_len);
- tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len,
- drop_hdrlen, topt, tsval, tsecr, 0);
+ tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash,
+ tcp_now, payload_len);
+ tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len,
+ drop_hdrlen, topt, tsval, tsecr, 0);
return mb;
}
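/*
 * Per-packet LRO work: validate the TCP checksum, parse a leading timestamp
 * option in-line, then coalesce the segment into, eject, or create the
 * matching flow.
 */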
struct mbuf*
-tcp_lro_process_pkt(struct mbuf *lro_mb, struct ip *ip_hdr,
- struct tcphdr *tcp_hdr, int drop_hdrlen)
+tcp_lro_process_pkt(struct mbuf *lro_mb, int drop_hdrlen)
{
int flow_id = TCP_LRO_FLOW_UNINIT;
int hash;
int ret_response = TCP_LRO_CONSUMED;
int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0;
u_int8_t ecn;
-
- if (lro_mb->m_len < (int32_t)sizeof (struct tcpiphdr)) {
- if ((lro_mb = m_pullup(lro_mb, sizeof(struct tcpiphdr))) == 0) {
+ struct ip *ip_hdr;
+ struct tcphdr *tcp_hdr;
+
+ if (lro_mb->m_len < drop_hdrlen) {
+ if ((lro_mb = m_pullup(lro_mb, drop_hdrlen)) == NULL) {
tcpstat.tcps_rcvshort++;
- m_freem(lro_mb);
+ m_freem(lro_mb);
if (lrodebug) {
printf("tcp_lro_process_pkt:mbuf too short.\n");
}
}
}
- if ((lro_mb = lro_tcp_xsum_validate(lro_mb,
- (struct ipovly*)ip_hdr, tcp_hdr)) == NULL) {
+ ip_hdr = mtod(lro_mb, struct ip*);
+ tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + sizeof(struct ip));
+
+ /* Just in case */
+ lro_mb->m_pkthdr.pkt_flags &= ~PKTF_SW_LRO_DID_CSUM;
+
+ if ((lro_mb = lro_tcp_xsum_validate(lro_mb, ip_hdr, tcp_hdr)) == NULL) {
if (lrodebug) {
printf("tcp_lro_process_pkt: TCP xsum failed.\n");
}
- return NULL;
+ return NULL;
}
/* Update stats */
lro_pkt_count++;
/* Avoids checksumming in tcp_input */
- lro_mb->m_pkthdr.aux_flags |= MAUXF_SW_LRO_DID_CSUM;
-
+ lro_mb->m_pkthdr.pkt_flags |= PKTF_SW_LRO_DID_CSUM;
+
off = tcp_hdr->th_off << 2;
- optlen = off - sizeof (struct tcphdr);
+ optlen = off - sizeof(struct tcphdr);
payload_len = ip_hdr->ip_len - off;
optp = (u_char *)(tcp_hdr + 1);
/*
* quickly get the values now and not bother calling
* tcp_dooptions(), etc.
*/
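+ /* Start from a zeroed tcpopt so TOF_TS is set only by the parse below */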
+ bzero(&to, sizeof(to));
if ((optlen == TCPOLEN_TSTAMP_APPA ||
- (optlen > TCPOLEN_TSTAMP_APPA &&
- optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
- *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
- (tcp_hdr->th_flags & TH_SYN) == 0) {
- to.to_flags |= TOF_TS;
- to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
- to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
+ (optlen > TCPOLEN_TSTAMP_APPA &&
+ optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
+ *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
+ (tcp_hdr->th_flags & TH_SYN) == 0) {
+ to.to_flags |= TOF_TS;
+ to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
+ to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
} else {
/*
- * If TCP timestamps are not in use, or not the first option,
- * skip LRO path since timestamps are used to avoid LRO
+ * If TCP timestamps are not in use, or not the first option,
+ * skip LRO path since timestamps are used to avoid LRO
* from introducing additional latencies for retransmissions
* and other slow-paced transmissions.
*/
}
/* list all the conditions that can trigger a flow ejection here */
-
+
thflags = tcp_hdr->th_flags;
- if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) {
+ if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) {
eject_flow = tcpflags = 1;
- }
-
- if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) &&
- (to.to_flags & TOF_TS))) {
+ }
+
+ if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) &&
+ (to.to_flags & TOF_TS))) {
eject_flow = unknown_tcpopts = 1;
- }
-
+ }
+
if (payload_len <= LRO_MIN_COALESC_SZ) { /* zero payload ACK */
eject_flow = 1;
}
break;
case TCP_LRO_COALESCE:
- if ((payload_len != 0) && (unknown_tcpopts == 0) &&
- (tcpflags == 0) && (ecn != IPTOS_ECN_CE) && (to.to_flags & TOF_TS)) {
+ if ((payload_len != 0) && (unknown_tcpopts == 0) &&
+ (tcpflags == 0) && (ecn != IPTOS_ECN_CE) && (to.to_flags & TOF_TS)) {
tcp_lro_coalesce(flow_id, lro_mb, tcp_hdr, payload_len,
- drop_hdrlen, &to,
- (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL,
- (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL,
- thflags);
- if (lrodebug >= 2) {
+ drop_hdrlen, &to,
+ (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL,
+ (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL,
+ thflags);
+ if (lrodebug >= 2) {
printf("tcp_lro_process_pkt: coalesce len = %d. flow_id = %d payload_len = %d drop_hdrlen = %d optlen = %d lport = %d seqnum = %x.\n",
- lro_flow_list[flow_id].lr_len, flow_id,
- payload_len, drop_hdrlen, optlen,
- ntohs(lro_flow_list[flow_id].lr_lport),
- ntohl(tcp_hdr->th_seq));
+ lro_flow_list[flow_id].lr_len, flow_id,
+ payload_len, drop_hdrlen, optlen,
+ ntohs(lro_flow_list[flow_id].lr_lport),
+ ntohl(tcp_hdr->th_seq));
}
if (lro_flow_list[flow_id].lr_mhead->m_pkthdr.lro_npkts >= coalesc_sz) {
eject_flow = 1;
if (eject_flow) {
mb = tcp_lro_eject_coalesced_pkt(flow_id);
lro_flow_list[flow_id].lr_seq = ntohl(tcp_hdr->th_seq) +
- payload_len;
+ payload_len;
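+ /* Refresh tcp_now and note how long this chain sat buffered in LRO */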
+ calculate_tcp_clock();
+ u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
lck_mtx_unlock(&tcp_lro_lock);
if (mb) {
+ mb->m_pkthdr.lro_elapsed = timestamp;
lro_proto_input(mb);
}
if (!coalesced) {
case TCP_LRO_EJECT_FLOW:
mb = tcp_lro_eject_coalesced_pkt(flow_id);
+ calculate_tcp_clock();
+ u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
lck_mtx_unlock(&tcp_lro_lock);
if (mb) {
- if (lrodebug)
+ if (lrodebug) {
printf("tcp_lro_process_pkt eject_flow, len = %d\n", mb->m_pkthdr.len);
+ }
+ mb->m_pkthdr.lro_elapsed = timestamp;
lro_proto_input(mb);
}
default:
lck_mtx_unlock(&tcp_lro_lock);
panic_plain("%s: unrecognized type %d", __func__, retval);
- break;
}
if (ret_response == TCP_LRO_FLOW_NOTFOUND) {
int i = 0;
struct mbuf *mb;
struct lro_flow *flow;
- int active_flows = 0;
- int outstanding_flows = 0;
int tcpclock_updated = 0;
lck_mtx_lock(&tcp_lro_lock);
while (i < TCP_LRO_NUM_FLOWS) {
flow = &lro_flow_list[i];
if (flow->lr_mhead != NULL) {
- active_flows++;
if (!tcpclock_updated) {
calculate_tcp_clock();
tcpclock_updated = 1;
}
- if (((tcp_now - flow->lr_timestamp) >= coalesc_time) ||
- (flow->lr_mhead->m_pkthdr.lro_npkts >=
- coalesc_sz)) {
-
- if (lrodebug >= 2)
- printf("tcp_lro_flush_flows: len =%d n_pkts = %d %d %d \n",
- flow->lr_len,
- flow->lr_mhead->m_pkthdr.lro_npkts,
- flow->lr_timestamp, tcp_now);
-
- mb = tcp_lro_eject_flow(i);
-
- if (mb) {
- lck_mtx_unlock(&tcp_lro_lock);
- lro_update_flush_stats(mb);
- lro_proto_input(mb);
- lck_mtx_lock(&tcp_lro_lock);
- }
- } else {
- tcp_lro_sched_timer(0);
- outstanding_flows++;
- if (lrodebug >= 2) {
- printf("tcp_lro_flush_flows: did not flush flow of len =%d deadline = %x timestamp = %x \n",
- flow->lr_len, tcp_now, flow->lr_timestamp);
- }
+ if (lrodebug >= 2) {
+ printf("tcp_lro_flush_flows: len =%d n_pkts = %d %d %d \n",
+ flow->lr_len,
+ flow->lr_mhead->m_pkthdr.lro_npkts,
+ flow->lr_timestamp, tcp_now);
}
- }
- if (flow->lr_flags & LRO_EJECT_REQ) {
+
+ u_int8_t timestamp = tcp_now - flow->lr_timestamp;
+
mb = tcp_lro_eject_flow(i);
+
if (mb) {
+ mb->m_pkthdr.lro_elapsed = timestamp;
lck_mtx_unlock(&tcp_lro_lock);
+ lro_update_flush_stats(mb);
lro_proto_input(mb);
- lro_eject_req++;
lck_mtx_lock(&tcp_lro_lock);
}
}
i++;
}
lck_mtx_unlock(&tcp_lro_lock);
-#if 0
- if (lrocount == 900) {
- printf("%s: %d %d %d %d oo: %d mismatch: %d ej_req: %d coll: %d \n",
- __func__,
- tcpstat.tcps_coalesced_pack,
- tcpstat.tcps_lro_twopack,
- tcpstat.tcps_lro_multpack,
- tcpstat.tcps_lro_largepack,
- lro_seq_outoforder,
- lro_seq_mismatch,
- lro_eject_req,
- tcpstat.tcps_flowtbl_collision);
- printf("%s: all: %d single: %d double: %d good: %d \n",
- __func__, lro_flushes, lro_single_flushes,
- lro_double_flushes, lro_good_flushes);
- lrocount = 0;
- } else {
- lrocount++;
- }
- if ((lrodebug >= 2) && (active_flows > 1)) {
- printf("lro_flush_flows: active_flows = %d \n", active_flows);
- }
-#endif
}
/*
lro_timer_set = 1;
if (!hint) {
/* the intent is to wake up every coalesc_time msecs */
- clock_interval_to_deadline(coalesc_time,
- (NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline);
+ clock_interval_to_deadline(coalesc_time,
+ (NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline);
} else {
clock_interval_to_deadline(hint, NSEC_PER_SEC / TCP_RETRANSHZ,
- &lro_deadline);
+ &lro_deadline);
}
thread_call_enter_delayed(tcp_lro_timer, lro_deadline);
}
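/*
 * Entry point for inbound IPv4 TCP segments on the IP input path: screen out
 * traffic LRO does not help (cellular, loopback, IP options, non-TCP, short
 * or malformed headers) before handing the segment to tcp_lro_process_pkt().
 */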
struct tcphdr * tcp_hdr = NULL;
unsigned int off = 0;
- if (kipf_count != 0)
+ if (kipf_count != 0) {
return m;
+ }
- /*
- * Experiments on cellular show that the RTT is much higher
+ /*
+ * Experiments on cellular show that the RTT is much higher
* than the coalescing time of 5 msecs, causing lro to flush
- * 80% of the time on a single packet. Increasing
- * coalescing time for cellular does not show marked
+ * 80% of the time on a single packet. Increasing
+ * coalescing time for cellular does not show marked
* improvement to throughput either. Loopback perf is hurt
* by the 5 msec latency and it already sends large packets.
*/
- if ((m->m_pkthdr.rcvif->if_type == IFT_CELLULAR) ||
- (m->m_pkthdr.rcvif->if_type == IFT_LOOP)) {
+ if (IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) ||
+ (m->m_pkthdr.rcvif->if_type == IFT_LOOP)) {
return m;
}
ip_hdr = mtod(m, struct ip*);
+ /* don't deal with IP options */
+ if (hlen != sizeof(struct ip)) {
+ return m;
+ }
+
/* only TCP is coalesced */
if (ip_hdr->ip_p != IPPROTO_TCP) {
return m;
}
- if (m->m_len < (int32_t) sizeof (struct tcpiphdr)) {
- if (lrodebug) printf("tcp_lro m_pullup \n");
- if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
- tcpstat.tcps_rcvshort++;
+ if (m->m_len < (int32_t) sizeof(struct tcpiphdr)) {
+ if (lrodebug) {
+ printf("tcp_lro m_pullup \n");
+ }
+ if ((m = m_pullup(m, sizeof(struct tcpiphdr))) == NULL) {
+ tcpstat.tcps_rcvshort++;
if (lrodebug) {
printf("ip_lro: rcvshort.\n");
}
return NULL;
}
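+ /* m_pullup() may have returned a different mbuf; refresh the header pointer */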
+ ip_hdr = mtod(m, struct ip*);
}
tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen);
- tlen = ip_hdr->ip_len ; //ignore IP header bytes len
+ tlen = ip_hdr->ip_len; //ignore IP header bytes len
m->m_pkthdr.lro_pktlen = tlen; /* Used to return max pkt encountered to tcp */
m->m_pkthdr.lro_npkts = 1; /* Initialize a counter to hold num pkts coalesced */
+ m->m_pkthdr.lro_elapsed = 0; /* Initialize the field to carry elapsed time */
off = tcp_hdr->th_off << 2;
- if (off < sizeof (struct tcphdr) || off > tlen) {
- tcpstat.tcps_rcvbadoff++;
+ if (off < sizeof(struct tcphdr) || off > tlen) {
+ tcpstat.tcps_rcvbadoff++;
if (lrodebug) {
printf("ip_lro: TCP off greater than TCP header.\n");
}
return m;
}
- return (tcp_lro_process_pkt(m, ip_hdr, tcp_hdr, hlen + off));
+ return tcp_lro_process_pkt(m, hlen + off);
}
static void
struct ip* ip_hdr = mtod(m, struct ip*);
if (lrodebug >= 3) {
- printf("lro_proto_input: ip_len = %d \n",
- ip_hdr->ip_len);
+ printf("lro_proto_input: ip_len = %d \n",
+ ip_hdr->ip_len);
}
lro_update_stats(m);
ip_proto_dispatch_in_wrapper(m, ip_hdr->ip_hl << 2, ip_hdr->ip_p);
}
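/*
 * Validate the TCP checksum of a segment entering LRO; bad segments are
 * freed and NULL is returned.
 */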
static struct mbuf *
-lro_tcp_xsum_validate(struct mbuf *m, struct ipovly *ipov, struct tcphdr * th)
+lro_tcp_xsum_validate(struct mbuf *m, struct ip *ip, struct tcphdr * th)
{
-
- struct ip* ip = (struct ip*)ipov;
- int tlen = ip->ip_len;
- int len;
- struct ifnet *ifp = ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) ?
- m->m_pkthdr.rcvif: NULL;
-
/* Expect 32-bit aligned data pointer on strict-align platforms */
MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
- if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
- if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) {
- u_short pseudo;
- char b[9];
-
- bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1));
- bzero(ipov->ih_x1, sizeof (ipov->ih_x1));
- ipov->ih_len = (u_short)tlen;
-#if BYTE_ORDER != BIG_ENDIAN
- HTONS(ipov->ih_len);
-#endif
- pseudo = in_cksum(m, sizeof (struct ip));
- bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1));
-
- th->th_sum = in_addword(pseudo, (m->m_pkthdr.csum_data & 0xFFFF));
- } else {
- if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
- th->th_sum = m->m_pkthdr.csum_data;
- else
- th->th_sum = in_pseudo(ip->ip_src.s_addr,
- ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data +
- ip->ip_len + IPPROTO_TCP));
- }
- th->th_sum ^= 0xffff;
- } else {
- char b[9];
- /*
- * Checksum extended TCP header and data.
- */
- bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1));
- bzero(ipov->ih_x1, sizeof (ipov->ih_x1));
- ipov->ih_len = (u_short)tlen;
-#if BYTE_ORDER != BIG_ENDIAN
- HTONS(ipov->ih_len);
-#endif
- len = sizeof (struct ip) + tlen;
- th->th_sum = in_cksum(m, len);
- bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1));
-
- tcp_in_cksum_stats(len);
- }
- if (th->th_sum) {
- tcpstat.tcps_rcvbadsum++;
- if (ifp != NULL && ifp->if_tcp_stat != NULL) {
- atomic_add_64(&ifp->if_tcp_stat->badformat, 1);
+ /* we shouldn't get here for IP with options; hence sizeof (ip) */
+ if (tcp_input_checksum(AF_INET, m, th, sizeof(*ip), ip->ip_len)) {
+ if (lrodebug) {
+ printf("%s: bad xsum and drop m = 0x%llx.\n", __func__,
+ (uint64_t)VM_KERNEL_ADDRPERM(m));
}
- if (lrodebug)
- printf("lro_tcp_xsum_validate: bad xsum and drop m = %p.\n",m);
m_freem(m);
return NULL;
}
- /* revert back the order as IP will look into this again. */
-#if BYTE_ORDER != BIG_ENDIAN
- NTOHS(ipov->ih_len);
-#endif
+
return m;
}
/*
- * When TCP detects a stable, steady flow without out of ordering,
+ * When TCP detects a stable, steady flow without out of ordering,
* with a sufficiently high cwnd, it invokes LRO.
*/
int
-tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen)
+tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen)
{
int hash;
int flow_id;
struct mbuf *eject_mb;
struct lro_flow *lf;
- hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
- tcp_hdr->th_sport, tcp_hdr->th_dport,
- (TCP_LRO_FLOW_MAP - 1));
+ hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
+ tcp_hdr->th_sport, tcp_hdr->th_dport,
+ (TCP_LRO_FLOW_MAP - 1));
+
-
lck_mtx_lock_spin(&tcp_lro_lock);
flow_id = lro_flow_map[hash];
if (flow_id != TCP_LRO_FLOW_NOTFOUND) {
(lf->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
(lf->lr_fport == tcp_hdr->th_sport) &&
(lf->lr_lport == tcp_hdr->th_dport)) {
- if ((lf->lr_tcphdr == NULL) &&
- (lf->lr_seq != (tcp_hdr->th_seq + tlen))) {
+ if ((lf->lr_tcphdr == NULL) &&
+ (lf->lr_seq != (tcp_hdr->th_seq + tlen))) {
lf->lr_seq = tcp_hdr->th_seq + tlen;
- }
+ }
lf->lr_flags &= ~LRO_EJECT_REQ;
}
- lck_mtx_unlock(&tcp_lro_lock);
+ lck_mtx_unlock(&tcp_lro_lock);
return 0;
}
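/*
 * No flow tracked yet.  tcp_input passes header fields in host order, but
 * the LRO path sees them in network order, so convert before inserting the
 * flow and restore afterwards.
 */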
HTONL(tcp_hdr->th_seq);
HTONL(tcp_hdr->th_ack);
- eject_mb =
- tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash,
- NULL, NULL, NULL);
+ eject_mb =
+ tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash,
+ NULL, NULL, NULL);
lck_mtx_unlock(&tcp_lro_lock);
NTOHL(tcp_hdr->th_ack);
if (lrodebug >= 3) {
printf("%s: src = %x dst = %x sport = %d dport = %d seq %x \n",
- __func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
- tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq);
+ __func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
+ tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq);
}
ASSERT(eject_mb == NULL);
return 0;
/*
* When TCP detects loss or idle condition, it stops offloading
- * to LRO.
+ * to LRO.
*/
int
-tcp_lro_remove_state(struct in_addr saddr, struct in_addr daddr,
- unsigned short sport, unsigned short dport)
+tcp_lro_remove_state(struct in_addr saddr, struct in_addr daddr,
+ unsigned short sport, unsigned short dport)
{
int hash, flow_id;
struct lro_flow *lf;
hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
- (TCP_LRO_FLOW_MAP - 1));
+ (TCP_LRO_FLOW_MAP - 1));
lck_mtx_lock_spin(&tcp_lro_lock);
flow_id = lro_flow_map[hash];
if (flow_id == TCP_LRO_FLOW_UNINIT) {
return 0;
}
lf = &lro_flow_list[flow_id];
- if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
+ if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
(lf->lr_laddr.s_addr == saddr.s_addr) &&
(lf->lr_fport == dport) &&
(lf->lr_lport == sport)) {
if (lrodebug) {
- printf("%s: %x %x\n", __func__,
- lf->lr_flags, lf->lr_seq);
+ printf("%s: %x %x\n", __func__,
+ lf->lr_flags, lf->lr_seq);
}
lf->lr_flags |= LRO_EJECT_REQ;
}
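/*
 * Keep a tracked flow's notion of the next expected sequence number in step
 * with the receiver's rcv_nxt.
 */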
void
tcp_update_lro_seq(__uint32_t rcv_nxt, struct in_addr saddr, struct in_addr daddr,
- unsigned short sport, unsigned short dport)
+ unsigned short sport, unsigned short dport)
{
int hash, flow_id;
struct lro_flow *lf;
- hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
- (TCP_LRO_FLOW_MAP - 1));
+ hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
+ (TCP_LRO_FLOW_MAP - 1));
lck_mtx_lock_spin(&tcp_lro_lock);
flow_id = lro_flow_map[hash];
if (flow_id == TCP_LRO_FLOW_UNINIT) {
static void
lro_update_stats(struct mbuf *m)
{
- switch(m->m_pkthdr.lro_npkts) {
+ switch (m->m_pkthdr.lro_npkts) {
case 0: /* fall through */
- case 1:
+ case 1:
break;
-
- case 2:
+
+ case 2:
tcpstat.tcps_lro_twopack++;
break;
-
+
case 3: /* fall through */
case 4:
tcpstat.tcps_lro_multpack++;
break;
-
- default:
+
+ default:
tcpstat.tcps_lro_largepack++;
break;
}
lro_update_flush_stats(struct mbuf *m)
{
lro_flushes++;
- switch(m->m_pkthdr.lro_npkts) {
+ switch (m->m_pkthdr.lro_npkts) {
case 0: ASSERT(0);
case 1: lro_single_flushes++;
break;