xnu-4570.20.62.tar.gz

[apple/xnu.git] / bsd / nfs / nfs_syscalls.c
diff --git a/bsd/nfs/nfs_syscalls.c b/bsd/nfs/nfs_syscalls.c

index 5fa063cea66d671d0edbf09d5b8b579e3d4a9d80..2a20ee2eae1f1403a254206f9ba7594e3868ef2a 100644 (file)
--- a/bsd/nfs/nfs_syscalls.c
+++ b/bsd/nfs/nfs_syscalls.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2000-2014 Apple Inc.  All rights reserved.
+ * Copyright (c) 2000-2016 Apple Inc.  All rights reserved.
   *
   * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   * 
@@ -131,7 +131,10 @@ extern int nfsrv_wg_delay;
  extern int nfsrv_wg_delay_v3;
  
  static int nfsrv_require_resv_port = 0;
-static int nfsrv_deadsock_timer_on = 0;
+static time_t  nfsrv_idlesock_timer_on = 0;
+static int nfsrv_sock_tcp_cnt = 0;
+#define NFSD_MIN_IDLE_TIMEOUT 30
+static int nfsrv_sock_idle_timeout = 3600; /* One hour */
  
  int    nfssvc_export(user_addr_t argp);
  int    nfssvc_nfsd(void);
@@ -160,7 +163,6 @@ SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_max, CTLFLAG_RW | CT
  SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsiod_thread_count, 0, "");
  SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, lockd_mounts, CTLFLAG_RD | CTLFLAG_LOCKED, &nfs_lockd_mounts, 0, "");
  SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, max_async_writes, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_max_async_writes, 0, "");
-SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, single_des, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_single_des, 0, "");
  SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_delete, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_delete, 0, "");
  SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_dotzfs, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_dotzfs, 0, "");
  SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_for_getattr, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_for_getattr, 0, "");
@@ -170,7 +172,8 @@ SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, is_mobile, CTLFLAG_RW | CTLFLAG_LO
  SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, squishy_flags, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_squishy_flags, 0, "");
  SYSCTL_UINT(_vfs_generic_nfs_client, OID_AUTO, debug_ctl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_debug_ctl, 0, "");
  SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, readlink_nocache, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_readlink_nocache, 0, "");
-
+SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, root_steals_gss_context, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_root_steals_ctx, 0, "");
+SYSCTL_STRING(_vfs_generic_nfs_client, OID_AUTO, default_nfs4domain, CTLFLAG_RW | CTLFLAG_LOCKED, nfs4_default_domain, sizeof(nfs4_default_domain), "");
  #endif /* NFSCLIENT */
  
  #if NFSSERVER
@@ -189,11 +192,13 @@ SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, fsevents, CTLFLAG_RW | CTLFLAG_LOC
  #endif
  SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsd_thread_max, 0, "");
  SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsd_thread_count, 0, "");
+SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_sock_idle_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_sock_idle_timeout, 0, "");
+SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_tcp_connections, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsrv_sock_tcp_cnt, 0, "");
  #ifdef NFS_UC_Q_DEBUG
  SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, use_upcall_svc, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_use_proxy, 0, "");
  SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_queue_limit, 0, "");
  SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_max_seen, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_queue_max_seen, 0, "");
-SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_count, CTLFLAG_RD | CTLFLAG_LOCKED, (int *)&nfsrv_uc_queue_count, 0, "");
+SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_count, CTLFLAG_RD | CTLFLAG_LOCKED, __DECONST(int *, &nfsrv_uc_queue_count), 0, "");
  #endif
  #endif /* NFSSERVER */
  
@@ -221,7 +226,7 @@ static int
  mapid2name(struct nfs_testmapid *map)
  {
         int error;
-       int len = sizeof(map->ntm_name);
+       size_t len = sizeof(map->ntm_name);
         
         if (map->ntm_grpflag)
                 error = kauth_cred_gid2guid((gid_t)map->ntm_id, &map->ntm_guid);
@@ -237,12 +242,12 @@ mapid2name(struct nfs_testmapid *map)
         
  }
  
-
  static int
  nfsclnt_testidmap(proc_t p, user_addr_t argp)
  {
         struct nfs_testmapid mapid;
         int error, coerror;
+       size_t len = sizeof(mapid.ntm_name);
                 
          /* Let root make this call. */
         error = proc_suser(p);
@@ -252,10 +257,22 @@ nfsclnt_testidmap(proc_t p, user_addr_t argp)
         error = copyin(argp, &mapid, sizeof(mapid));
         if (error)
                 return (error);
-       if (mapid.ntm_name2id)
+       switch (mapid.ntm_lookup) {
+       case NTM_NAME2ID:
                 error = mapname2id(&mapid);
-       else
+               break;
+       case NTM_ID2NAME:
                 error = mapid2name(&mapid);
+               break;
+       case NTM_NAME2GUID:
+               error = nfs4_id2guid(mapid.ntm_name, &mapid.ntm_guid, mapid.ntm_grpflag);
+               break;
+       case NTM_GUID2NAME:
+               error = nfs4_guid2id(&mapid.ntm_guid, mapid.ntm_name, &len, mapid.ntm_grpflag);
+               break;
+       default:
+               return (EINVAL);
+       }
  
         coerror = copyout(&mapid, argp, sizeof(mapid));
  
@@ -421,14 +438,19 @@ worktodo:
                 /* grab the current contents of the queue */
                 TAILQ_INIT(&iodq);
                 TAILQ_CONCAT(&iodq, &nmp->nm_iodq, r_achain);
+               /* Mark each iod request as being managed by an iod */
+               TAILQ_FOREACH(req, &iodq, r_achain) {
+                       lck_mtx_lock(&req->r_mtx);
+                       assert(!(req->r_flags & R_IOD));
+                       req->r_flags |= R_IOD;
+                       lck_mtx_unlock(&req->r_mtx);
+               }
                 lck_mtx_unlock(nfsiod_mutex);
  
                 /* process the queue */
                 TAILQ_FOREACH_SAFE(req, &iodq, r_achain, treq) {
                         TAILQ_REMOVE(&iodq, req, r_achain);
-                       lck_mtx_lock(nfsiod_mutex);
-                       req->r_achain.tqe_next = NFSIODCOMPLETING;
-                       lck_mtx_unlock(nfsiod_mutex);
+                       req->r_achain.tqe_next = NFSREQNOLIST;
                         req->r_callback.rcb_func(req);
                 }
  
@@ -571,6 +593,11 @@ out:
         vnode_put(vp);
         if (error)
                 return (error);
+       /*
+        * At first blush, this may appear to leak a kernel stack
+        * address, but the copyout() never reaches &nfh.nfh_fhp
+        * (sizeof(fhandle_t) < sizeof(nfh)).
+        */
         error = copyout((caddr_t)&nfh, uap->fhp, sizeof(fhandle_t));
         return (error);
  }
@@ -659,6 +686,11 @@ fhopen( proc_t p,
                 goto bad;
         }
  
+#if CONFIG_MACF
+       if ((error = mac_vnode_check_open(ctx, vp, fmode)))
+               goto bad;
+#endif
+
         /* compute action to be authorized */
         action = 0;
         if (fmode & FREAD)
@@ -831,8 +863,12 @@ nfssvc_addsock(socket_t so, mbuf_t mynam)
         }
  
         /* Set protocol options and reserve some space (for UDP). */
-       if (sotype == SOCK_STREAM)
+       if (sotype == SOCK_STREAM) {
+               error = nfsrv_check_exports_allow_address(mynam);
+               if (error)
+                       return (error);
                 sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
+       }
         if ((sodomain == AF_INET) && (soprotocol == IPPROTO_TCP))
                 sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
         if (sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */
@@ -898,6 +934,58 @@ nfssvc_addsock(socket_t so, mbuf_t mynam)
         /* add the socket to the list */
         first = TAILQ_EMPTY(&nfsrv_socklist);
         TAILQ_INSERT_TAIL(&nfsrv_socklist, slp, ns_chain);
+       if (soprotocol == IPPROTO_TCP) {
+               nfsrv_sock_tcp_cnt++;
+               if (nfsrv_sock_idle_timeout < 0)
+                       nfsrv_sock_idle_timeout = 0;
+               if (nfsrv_sock_idle_timeout && (nfsrv_sock_idle_timeout < NFSD_MIN_IDLE_TIMEOUT))
+                       nfsrv_sock_idle_timeout = NFSD_MIN_IDLE_TIMEOUT;
+               /*
+                * Possibly start or stop the idle timer. We only start the idle timer when
+                * we have more than 2 * nfsd_thread_max connections. If the idle timer is
+                * on then we may need to turn it off based on the nvsrv_sock_idle_timeout or
+                * the number of connections.
+                */
+               if ((nfsrv_sock_tcp_cnt > 2 * nfsd_thread_max) || nfsrv_idlesock_timer_on) {
+                       if (nfsrv_sock_idle_timeout == 0 || nfsrv_sock_tcp_cnt <= 2 * nfsd_thread_max) {
+                               if (nfsrv_idlesock_timer_on) {
+                                       thread_call_cancel(nfsrv_idlesock_timer_call);
+                                       nfsrv_idlesock_timer_on = 0;
+                               }
+                       } else {
+                               struct nfsrv_sock *old_slp;
+                               struct timeval now;
+                               time_t time_to_wait = nfsrv_sock_idle_timeout;
+                               /*
+                                * Get the oldest tcp socket and calculate the
+                                * earliest time for the next idle timer to fire
+                                * based on the possibly updated nfsrv_sock_idle_timeout
+                                */
+                               TAILQ_FOREACH(old_slp, &nfsrv_socklist, ns_chain) {
+                                       if (old_slp->ns_sotype == SOCK_STREAM) {
+                                               microuptime(&now);
+                                               time_to_wait -= now.tv_sec - old_slp->ns_timestamp;
+                                               if (time_to_wait < 1)
+                                                       time_to_wait = 1;
+                                               break;
+                                       }
+                               }
+                               /*
+                                * If we have a timer scheduled, but if its going to fire too late,
+                                * turn it off.
+                                */
+                               if (nfsrv_idlesock_timer_on > now.tv_sec + time_to_wait) {
+                                       thread_call_cancel(nfsrv_idlesock_timer_call);
+                                       nfsrv_idlesock_timer_on = 0;
+                               }
+                               /* Schedule the idle thread if it isn't already */
+                               if (!nfsrv_idlesock_timer_on) {
+                                       nfs_interval_timer_start(nfsrv_idlesock_timer_call, time_to_wait * 1000);
+                                       nfsrv_idlesock_timer_on = now.tv_sec + time_to_wait;
+                               }
+                       }
+               }
+       }
  
         sock_retain(so); /* grab a retain count on the socket */
         slp->ns_so = so;
@@ -909,7 +997,7 @@ nfssvc_addsock(socket_t so, mbuf_t mynam)
  
         /* mark that the socket is not in the nfsrv_sockwg list */
         slp->ns_wgq.tqe_next = SLPNOLIST;
-       
+
         slp->ns_flag = SLP_VALID | SLP_NEEDQ;
  
         nfsrv_wakenfsd(slp);
@@ -1057,6 +1145,11 @@ nfssvc_nfsd(void)
                         if (!nfsd->nfsd_slp && slp) {
                                 /* we found a socket to work on, grab a reference */
                                 slp->ns_sref++;
+                               microuptime(&now);
+                               slp->ns_timestamp = now.tv_sec;
+                               /* We keep the socket list in least recently used order for reaping idle sockets */
+                               TAILQ_REMOVE(&nfsrv_socklist, slp, ns_chain);
+                               TAILQ_INSERT_TAIL(&nfsrv_socklist, slp, ns_chain);
                                 nfsd->nfsd_slp = slp;
                                 opcnt = 0;
                                 /* and put it at the back of the work queue */
@@ -1376,15 +1469,7 @@ nfsrv_zapsock(struct nfsrv_sock *slp)
         if (so == NULL)
                 return;
  
-       /*
-        * Attempt to deter future up-calls, but leave the
-        * up-call info in place to avoid a race with the
-        * networking code.
-        */
-       socket_lock(so, 1);
-       so->so_rcv.sb_flags &= ~SB_UPCALL;
-       socket_unlock(so, 1);
-
+       sock_setupcall(so, NULL, NULL);
         sock_shutdown(so, SHUT_RDWR);
  
         /*
@@ -1416,9 +1501,6 @@ nfsrv_slpfree(struct nfsrv_sock *slp)
         slp->ns_nam = slp->ns_raw = slp->ns_rec = slp->ns_frag = NULL;
         slp->ns_reccnt = 0;
  
-       if (slp->ns_ua)
-               FREE(slp->ns_ua, M_NFSSVC);
-
         for (nwp = slp->ns_tq.lh_first; nwp; nwp = nnwp) {
                 nnwp = nwp->nd_tq.le_next;
                 LIST_REMOVE(nwp, nd_tq);
@@ -1444,12 +1526,9 @@ nfsrv_slpfree(struct nfsrv_sock *slp)
   * Derefence a server socket structure. If it has no more references and
   * is no longer valid, you can throw it away.
   */
-void
-nfsrv_slpderef(struct nfsrv_sock *slp)
+static void
+nfsrv_slpderef_locked(struct nfsrv_sock *slp)
  {
-       struct timeval now;
-
-       lck_mtx_lock(nfsd_mutex);
         lck_rw_lock_exclusive(&slp->ns_rwlock);
         slp->ns_sref--;
  
@@ -1463,7 +1542,6 @@ nfsrv_slpderef(struct nfsrv_sock *slp)
                         slp->ns_flag &= ~SLP_QUEUED;
                 }
                 lck_rw_done(&slp->ns_rwlock);
-               lck_mtx_unlock(nfsd_mutex);
                 return;
         }
  
@@ -1476,66 +1554,88 @@ nfsrv_slpderef(struct nfsrv_sock *slp)
                         TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
                 slp->ns_flag &= ~SLP_QUEUED;
         }
+       lck_rw_done(&slp->ns_rwlock);
  
-       /*
-        * Queue the socket up for deletion
-        * and start the timer to delete it
-        * after it has been in limbo for
-        * a while.
-        */
-       microuptime(&now);
-       slp->ns_timestamp = now.tv_sec;
         TAILQ_REMOVE(&nfsrv_socklist, slp, ns_chain);
-       TAILQ_INSERT_TAIL(&nfsrv_deadsocklist, slp, ns_chain);
-       if (!nfsrv_deadsock_timer_on) {
-               nfsrv_deadsock_timer_on = 1;
-               nfs_interval_timer_start(nfsrv_deadsock_timer_call,
-                               NFSRV_DEADSOCKDELAY * 1000);
-       }
+       if (slp->ns_sotype == SOCK_STREAM)
+               nfsrv_sock_tcp_cnt--;
  
-       lck_rw_done(&slp->ns_rwlock);
         /* now remove from the write gather socket list */ 
         if (slp->ns_wgq.tqe_next != SLPNOLIST) {
                 TAILQ_REMOVE(&nfsrv_sockwg, slp, ns_wgq);
                 slp->ns_wgq.tqe_next = SLPNOLIST;
         }
+       nfsrv_slpfree(slp);
+}
+
+void
+nfsrv_slpderef(struct nfsrv_sock *slp)
+{
+       lck_mtx_lock(nfsd_mutex);
+       nfsrv_slpderef_locked(slp);
         lck_mtx_unlock(nfsd_mutex);
  }
  
  /*
- * Check periodically for dead sockets pending delete.
- * If a socket has been dead for more than NFSRV_DEADSOCKDELAY
- * seconds then we assume it's safe to free.
+ * Check periodically for idle sockest if needed and
+ * zap them.
   */
  void
-nfsrv_deadsock_timer(__unused void *param0, __unused void *param1)
+nfsrv_idlesock_timer(__unused void *param0, __unused void *param1)
  {
-       struct nfsrv_sock *slp;
+       struct nfsrv_sock *slp, *tslp;
         struct timeval now;
-       time_t time_to_wait;
+       time_t time_to_wait = nfsrv_sock_idle_timeout;
  
         microuptime(&now);
         lck_mtx_lock(nfsd_mutex);
  
-       while ((slp = TAILQ_FIRST(&nfsrv_deadsocklist))) {
-               if ((slp->ns_timestamp + NFSRV_DEADSOCKDELAY) > now.tv_sec)
-                       break;
-               TAILQ_REMOVE(&nfsrv_deadsocklist, slp, ns_chain);
-               nfsrv_slpfree(slp);
-       }
-       if (TAILQ_EMPTY(&nfsrv_deadsocklist)) {
-               nfsrv_deadsock_timer_on = 0;
+       /* Turn off the timer if we're suppose to and get out */
+       if (nfsrv_sock_idle_timeout < NFSD_MIN_IDLE_TIMEOUT)
+           nfsrv_sock_idle_timeout = 0;
+       if ((nfsrv_sock_tcp_cnt <= 2 * nfsd_thread_max) || (nfsrv_sock_idle_timeout == 0)) {
+               nfsrv_idlesock_timer_on = 0;
                 lck_mtx_unlock(nfsd_mutex);
                 return;
         }
-       time_to_wait = (slp->ns_timestamp + NFSRV_DEADSOCKDELAY) - now.tv_sec;
-       if (time_to_wait < 1)
-               time_to_wait = 1;
  
-       lck_mtx_unlock(nfsd_mutex);
+       TAILQ_FOREACH_SAFE(slp, &nfsrv_socklist, ns_chain, tslp) {
+               lck_rw_lock_exclusive(&slp->ns_rwlock);
+               /* Skip udp and referenced sockets */
+               if (slp->ns_sotype == SOCK_DGRAM || slp->ns_sref) {
+                       lck_rw_done(&slp->ns_rwlock);
+                       continue;
+               }
+               /*
+                * If this is the first non-referenced socket that hasn't idle out,
+                * use its time stamp to calculate the earlist time in the future
+                * to start the next invocation of the timer. Since the nfsrv_socklist
+                * is sorted oldest access to newest. Once we find the first one,
+                * we're done and break out of the loop.
+                */
+               if (((slp->ns_timestamp + nfsrv_sock_idle_timeout)  >  now.tv_sec) ||
+                       nfsrv_sock_tcp_cnt <= 2 * nfsd_thread_max) {
+                       time_to_wait -= now.tv_sec - slp->ns_timestamp;
+                       if (time_to_wait < 1)
+                               time_to_wait = 1;
+                       lck_rw_done(&slp->ns_rwlock);
+                       break;
+               }
+               /*
+                * Bump the ref count. nfsrv_slpderef below will destroy
+                * the socket, since nfsrv_zapsock has closed it.
+                */
+               slp->ns_sref++;
+               nfsrv_zapsock(slp);
+               lck_rw_done(&slp->ns_rwlock);
+               nfsrv_slpderef_locked(slp);
+       }
  
-       nfs_interval_timer_start(nfsrv_deadsock_timer_call,
-               time_to_wait * 1000);
+       /* Start ourself back up */
+       nfs_interval_timer_start(nfsrv_idlesock_timer_call, time_to_wait * 1000);
+       /* Remember when the next timer will fire for nfssvc_addsock. */
+       nfsrv_idlesock_timer_on = now.tv_sec + time_to_wait;
+       lck_mtx_unlock(nfsd_mutex);
  }
  
  /*
@@ -1554,33 +1654,14 @@ nfsrv_cleanup(void)
         microuptime(&now);
         for (slp = TAILQ_FIRST(&nfsrv_socklist); slp != 0; slp = nslp) {
                 nslp = TAILQ_NEXT(slp, ns_chain);
-               if (slp->ns_flag & SLP_VALID) {
-                       lck_rw_lock_exclusive(&slp->ns_rwlock);
+               lck_rw_lock_exclusive(&slp->ns_rwlock);
+               slp->ns_sref++;
+               if (slp->ns_flag & SLP_VALID)
                         nfsrv_zapsock(slp);
-                       lck_rw_done(&slp->ns_rwlock);
-               }
-               if (slp->ns_flag & SLP_QUEUED) {
-                       if (slp->ns_flag & SLP_WAITQ)
-                               TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq);
-                       else
-                               TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
-                       slp->ns_flag &= ~SLP_QUEUED;
-               }
-               if (slp->ns_wgq.tqe_next != SLPNOLIST) {
-                       TAILQ_REMOVE(&nfsrv_sockwg, slp, ns_wgq);
-                       slp->ns_wgq.tqe_next = SLPNOLIST;
-               }
-               /* queue the socket up for deletion */
-               slp->ns_timestamp = now.tv_sec;
-               TAILQ_REMOVE(&nfsrv_socklist, slp, ns_chain);
-               TAILQ_INSERT_TAIL(&nfsrv_deadsocklist, slp, ns_chain);
-               if (!nfsrv_deadsock_timer_on) {
-                       nfsrv_deadsock_timer_on = 1;
-                       nfs_interval_timer_start(nfsrv_deadsock_timer_call,
-                               NFSRV_DEADSOCKDELAY * 1000);
-               }
+               lck_rw_done(&slp->ns_rwlock);
+               nfsrv_slpderef_locked(slp);
         }
-
+#
  #if CONFIG_FSE
         /*
          * Flush pending file write fsevents