X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/5d5c5d0d5b79ade9a973d55186ffda2638ba2b6e..3e170ce000f1506b7b5d2c5c7faec85ceabb573d:/bsd/nfs/nfs_syscalls.c diff --git a/bsd/nfs/nfs_syscalls.c b/bsd/nfs/nfs_syscalls.c index 3032fe683..12daa5588 100644 --- a/bsd/nfs/nfs_syscalls.c +++ b/bsd/nfs/nfs_syscalls.c @@ -1,31 +1,29 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2014 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_OSREFERENCE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the - * License may not be used to create, or enable the creation or - * redistribution of, unlawful or unlicensed copies of an Apple operating - * system, or to circumvent, violate, or enable the circumvention or - * violation of, any terms of an Apple operating system software license - * agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and * limitations under the License. - * - * @APPLE_LICENSE_OSREFERENCE_HEADER_END@ + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* @@ -66,13 +64,15 @@ * @(#)nfs_syscalls.c 8.5 (Berkeley) 3/30/95 * FreeBSD-Id: nfs_syscalls.c,v 1.32 1997/11/07 08:53:25 phk Exp $ */ +/* + * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce + * support for mandatory and extensible security protections. This notice + * is included in support of clause 2.2 (b) of the Apple Public License, + * Version 2.0. 
+ */ #include #include -/* XXX CSM 11/25/97 FreeBSD's generated syscall prototypes */ -#ifdef notyet -#include -#endif #include #include #include @@ -96,65 +96,395 @@ #include #include #include +#include #include +#include +#include -#include +#include #include #include -#if ISO -#include -#endif #include #include #include #include #include #include +#include #include #include -#include #include +#if CONFIG_MACF +#include +#endif + +kern_return_t thread_terminate(thread_t); /* XXX */ + +#if NFSSERVER + +extern int (*nfsrv_procs[NFS_NPROCS])(struct nfsrv_descript *nd, + struct nfsrv_sock *slp, + vfs_context_t ctx, + mbuf_t *mrepp); +extern int nfsrv_wg_delay; +extern int nfsrv_wg_delay_v3; + +static int nfsrv_require_resv_port = 0; +static time_t nfsrv_idlesock_timer_on = 0; +static int nfsrv_sock_tcp_cnt = 0; +#define NFSD_MIN_IDLE_TIMEOUT 30 +static int nfsrv_sock_idle_timeout = 3600; /* One hour */ -extern void unix_syscall_return(int); - -/* Global defs. */ -extern int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd, - struct nfssvc_sock *slp, - proc_t procp, - mbuf_t *mreqp); -extern int nfs_numasync; -extern int nfs_ioddelwri; -extern int nfsrtton; -extern struct nfsstats nfsstats; -extern int nfsrvw_procrastinate; -extern int nfsrvw_procrastinate_v3; - -struct nfssvc_sock *nfs_udpsock, *nfs_cltpsock; -static int nuidhash_max = NFS_MAXUIDHASH; - -static void nfsrv_zapsock(struct nfssvc_sock *slp); -static int nfssvc_iod(proc_t); -static int nfskerb_clientd(struct nfsmount *, struct nfsd_cargs *, int, user_addr_t, proc_t); - -static int nfs_asyncdaemon[NFS_MAXASYNCDAEMON]; - -#ifndef NFS_NOSERVER -int nfsd_waiting = 0; -static struct nfsdrt nfsdrt; -int nfs_numnfsd = 0; -static void nfsd_rt(int sotype, struct nfsrv_descript *nd, int cacherep); -static int nfssvc_addsock(socket_t, mbuf_t, proc_t); -static int nfssvc_nfsd(struct nfsd_srvargs *,user_addr_t, proc_t); -static int nfssvc_export(user_addr_t, proc_t); - -static int nfs_privport = 0; -/* XXX CSM 
11/25/97 Upgrade sysctl.h someday */ -#ifdef notyet -SYSCTL_INT(_vfs_nfs, NFS_NFSPRIVPORT, nfs_privport, CTLFLAG_RW, &nfs_privport, 0, ""); -SYSCTL_INT(_vfs_nfs, OID_AUTO, gatherdelay, CTLFLAG_RW, &nfsrvw_procrastinate, 0, ""); -SYSCTL_INT(_vfs_nfs, OID_AUTO, gatherdelay_v3, CTLFLAG_RW, &nfsrvw_procrastinate_v3, 0, ""); +int nfssvc_export(user_addr_t argp); +int nfssvc_nfsd(void); +int nfssvc_addsock(socket_t, mbuf_t); +void nfsrv_zapsock(struct nfsrv_sock *); +void nfsrv_slpderef(struct nfsrv_sock *); +void nfsrv_slpfree(struct nfsrv_sock *); + +#endif /* NFSSERVER */ + +/* + * sysctl stuff + */ +SYSCTL_DECL(_vfs_generic); +SYSCTL_NODE(_vfs_generic, OID_AUTO, nfs, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs hinge"); + +#if NFSCLIENT +SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, client, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs client hinge"); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, initialdowndelay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_initial_delay, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nextdowndelay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_delay, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_iosize, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_cache_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_cache_timeout, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, allow_async, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_allow_async, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, statfs_rate_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_statfs_rate_limit, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsiod_thread_max, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsiod_thread_count, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, lockd_mounts, CTLFLAG_RD | CTLFLAG_LOCKED, &nfs_lockd_mounts, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, max_async_writes, 
CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_max_async_writes, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, single_des, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_single_des, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_delete, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_delete, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_dotzfs, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_dotzfs, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_for_getattr, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_for_getattr, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, idmap_ctrl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_idmap_ctrl, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, callback_port, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_callback_port, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, is_mobile, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_is_mobile, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, squishy_flags, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_squishy_flags, 0, ""); +SYSCTL_UINT(_vfs_generic_nfs_client, OID_AUTO, debug_ctl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_debug_ctl, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, readlink_nocache, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_readlink_nocache, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, root_steals_gss_context, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_root_steals_ctx, 0, ""); +#endif /* NFSCLIENT */ + +#if NFSSERVER +SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, server, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs server hinge"); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay_v3, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay_v3, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, require_resv_port, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_require_resv_port, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, async, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_async, 0, ""); 
+SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, export_hash_size, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_export_hash_size, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, reqcache_size, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_reqcache_size, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, request_queue_length, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_sock_max_rec_queue_length, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, user_stats, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_user_stat_enabled, 0, ""); +SYSCTL_UINT(_vfs_generic_nfs_server, OID_AUTO, gss_context_ttl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_gss_context_ttl, 0, ""); +#if CONFIG_FSE +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, fsevents, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_fsevents_enabled, 0, ""); #endif +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsd_thread_max, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsd_thread_count, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_sock_idle_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_sock_idle_timeout, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_tcp_connections, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsrv_sock_tcp_cnt, 0, ""); +#ifdef NFS_UC_Q_DEBUG +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, use_upcall_svc, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_use_proxy, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_queue_limit, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_max_seen, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_queue_max_seen, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_count, CTLFLAG_RD | CTLFLAG_LOCKED, __DECONST(int *, &nfsrv_uc_queue_count), 0, ""); +#endif +#endif /* NFSSERVER */ + + +#if NFSCLIENT + +static int +mapname2id(struct nfs_testmapid *map) +{ + int error; + + error = nfs4_id2guid(map->ntm_name, 
&map->ntm_guid, map->ntm_grpflag); + if (error) + return (error); + + if (map->ntm_grpflag) + error = kauth_cred_guid2gid(&map->ntm_guid, (gid_t *)&map->ntm_id); + else + error = kauth_cred_guid2uid(&map->ntm_guid, (uid_t *)&map->ntm_id); + + return (error); +} + +static int +mapid2name(struct nfs_testmapid *map) +{ + int error; + int len = sizeof(map->ntm_name); + + if (map->ntm_grpflag) + error = kauth_cred_gid2guid((gid_t)map->ntm_id, &map->ntm_guid); + else + error = kauth_cred_uid2guid((uid_t)map->ntm_id, &map->ntm_guid); + + if (error) + return (error); + + error = nfs4_guid2id(&map->ntm_guid, map->ntm_name, &len, map->ntm_grpflag); + + return (error); + +} + + +static int +nfsclnt_testidmap(proc_t p, user_addr_t argp) +{ + struct nfs_testmapid mapid; + int error, coerror; + + /* Let root make this call. */ + error = proc_suser(p); + if (error) + return (error); + + error = copyin(argp, &mapid, sizeof(mapid)); + if (error) + return (error); + if (mapid.ntm_name2id) + error = mapname2id(&mapid); + else + error = mapid2name(&mapid); + + coerror = copyout(&mapid, argp, sizeof(mapid)); + + return (error ? error : coerror); +} + +int +nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval) +{ + struct lockd_ans la; + int error; + + switch (uap->flag) { + case NFSCLNT_LOCKDANS: + error = copyin(uap->argp, &la, sizeof(la)); + if (!error) + error = nfslockdans(p, &la); + break; + case NFSCLNT_LOCKDNOTIFY: + error = nfslockdnotify(p, uap->argp); + break; + case NFSCLNT_TESTIDMAP: + error = nfsclnt_testidmap(p, uap->argp); + break; + default: + error = EINVAL; + } + return (error); +} + + +/* + * Asynchronous I/O threads for client NFS. + * They do read-ahead and write-behind operations on the block I/O cache. + * + * The pool of up to nfsiod_thread_max threads is launched on demand and exit + * when unused for a while. There are as many nfsiod structs as there are + * nfsiod threads; however there's no strict tie between a thread and a struct. 
+ * Each thread puts an nfsiod on the free list and sleeps on it. When it wakes + * up, it removes the next struct nfsiod from the queue and services it. Then + * it will put the struct at the head of free list and sleep on it. + * Async requests will pull the next struct nfsiod from the head of the free list, + * put it on the work queue, and wake whatever thread is waiting on that struct. + */ + +/* + * nfsiod thread exit routine + * + * Must be called with nfsiod_mutex held so that the + * decision to terminate is atomic with the termination. + */ +void +nfsiod_terminate(struct nfsiod *niod) +{ + nfsiod_thread_count--; + lck_mtx_unlock(nfsiod_mutex); + if (niod) + FREE(niod, M_TEMP); + else + printf("nfsiod: terminating without niod\n"); + thread_terminate(current_thread()); + /*NOTREACHED*/ +} + +/* nfsiod thread startup routine */ +void +nfsiod_thread(void) +{ + struct nfsiod *niod; + int error; + + MALLOC(niod, struct nfsiod *, sizeof(struct nfsiod), M_TEMP, M_WAITOK); + if (!niod) { + lck_mtx_lock(nfsiod_mutex); + nfsiod_thread_count--; + wakeup(current_thread()); + lck_mtx_unlock(nfsiod_mutex); + thread_terminate(current_thread()); + /*NOTREACHED*/ + } + bzero(niod, sizeof(*niod)); + lck_mtx_lock(nfsiod_mutex); + TAILQ_INSERT_HEAD(&nfsiodfree, niod, niod_link); + wakeup(current_thread()); + error = msleep0(niod, nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE*hz, nfsiod_continue); + /* shouldn't return... so we have an error */ + /* remove an old nfsiod struct and terminate */ + lck_mtx_lock(nfsiod_mutex); + if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist))) + TAILQ_REMOVE(&nfsiodfree, niod, niod_link); + nfsiod_terminate(niod); + /*NOTREACHED*/ +} + +/* + * Start up another nfsiod thread. 
+ * (unless we're already maxed out and there are nfsiods running) + */ +int +nfsiod_start(void) +{ + thread_t thd = THREAD_NULL; + + lck_mtx_lock(nfsiod_mutex); + if ((nfsiod_thread_count >= NFSIOD_MAX) && (nfsiod_thread_count > 0)) { + lck_mtx_unlock(nfsiod_mutex); + return (EBUSY); + } + nfsiod_thread_count++; + if (kernel_thread_start((thread_continue_t)nfsiod_thread, NULL, &thd) != KERN_SUCCESS) { + lck_mtx_unlock(nfsiod_mutex); + return (EBUSY); + } + /* wait for the thread to complete startup */ + msleep(thd, nfsiod_mutex, PWAIT | PDROP, "nfsiodw", NULL); + thread_deallocate(thd); + return (0); +} + +/* + * Continuation for Asynchronous I/O threads for NFS client. + * + * Grab an nfsiod struct to work on, do some work, then drop it + */ +int +nfsiod_continue(int error) +{ + struct nfsiod *niod; + struct nfsmount *nmp; + struct nfsreq *req, *treq; + struct nfs_reqqhead iodq; + int morework; + + lck_mtx_lock(nfsiod_mutex); + niod = TAILQ_FIRST(&nfsiodwork); + if (!niod) { + /* there's no work queued up */ + /* remove an old nfsiod struct and terminate */ + if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist))) + TAILQ_REMOVE(&nfsiodfree, niod, niod_link); + nfsiod_terminate(niod); + /*NOTREACHED*/ + } + TAILQ_REMOVE(&nfsiodwork, niod, niod_link); + +worktodo: + while ((nmp = niod->niod_nmp)) { + if (nmp == NULL){ + niod->niod_nmp = NULL; + break; + } + + /* + * Service this mount's async I/O queue. + * + * In order to ensure some level of fairness between mounts, + * we grab all the work up front before processing it so any + * new work that arrives will be serviced on a subsequent + * iteration - and we have a chance to see if other work needs + * to be done (e.g. the delayed write queue needs to be pushed + * or other mounts are waiting for an nfsiod). 
+ */ + /* grab the current contents of the queue */ + TAILQ_INIT(&iodq); + TAILQ_CONCAT(&iodq, &nmp->nm_iodq, r_achain); + /* Mark each iod request as being managed by an iod */ + TAILQ_FOREACH(req, &iodq, r_achain) { + lck_mtx_lock(&req->r_mtx); + assert(!(req->r_flags & R_IOD)); + req->r_flags |= R_IOD; + lck_mtx_unlock(&req->r_mtx); + } + lck_mtx_unlock(nfsiod_mutex); + + /* process the queue */ + TAILQ_FOREACH_SAFE(req, &iodq, r_achain, treq) { + TAILQ_REMOVE(&iodq, req, r_achain); + req->r_achain.tqe_next = NFSREQNOLIST; + req->r_callback.rcb_func(req); + } + + /* now check if there's more/other work to be done */ + lck_mtx_lock(nfsiod_mutex); + morework = !TAILQ_EMPTY(&nmp->nm_iodq); + if (!morework || !TAILQ_EMPTY(&nfsiodmounts)) { + /* + * we're going to stop working on this mount but if the + * mount still needs more work so queue it up + */ + if (morework && nmp->nm_iodlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink); + nmp->nm_niod = NULL; + niod->niod_nmp = NULL; + } + } + + /* loop if there's still a mount to work on */ + if (!niod->niod_nmp && !TAILQ_EMPTY(&nfsiodmounts)) { + niod->niod_nmp = TAILQ_FIRST(&nfsiodmounts); + TAILQ_REMOVE(&nfsiodmounts, niod->niod_nmp, nm_iodlink); + niod->niod_nmp->nm_iodlink.tqe_next = NFSNOLIST; + } + if (niod->niod_nmp) + goto worktodo; + + /* queue ourselves back up - if there aren't too many threads running */ + if (nfsiod_thread_count <= NFSIOD_MAX) { + TAILQ_INSERT_HEAD(&nfsiodfree, niod, niod_link); + error = msleep0(niod, nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE*hz, nfsiod_continue); + /* shouldn't return... 
so we have an error */ + /* remove an old nfsiod struct and terminate */ + lck_mtx_lock(nfsiod_mutex); + if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist))) + TAILQ_REMOVE(&nfsiodfree, niod, niod_link); + } + nfsiod_terminate(niod); + /*NOTREACHED*/ + return (0); +} + +#endif /* NFSCLIENT */ + + +#if NFSSERVER /* * NFS server system calls @@ -169,17 +499,13 @@ getfh(proc_t p, struct getfh_args *uap, __unused int *retval) { vnode_t vp; struct nfs_filehandle nfh; - int error; + int error, fhlen, fidlen; struct nameidata nd; - struct vfs_context context; char path[MAXPATHLEN], *ptr; - u_int pathlen; + size_t pathlen; struct nfs_exportfs *nxfs; struct nfs_export *nx; - context.vc_proc = p; - context.vc_ucred = kauth_cred_get(); - /* * Must be super user */ @@ -187,12 +513,21 @@ getfh(proc_t p, struct getfh_args *uap, __unused int *retval) if (error) return (error); - error = copyinstr(uap->fname, path, MAXPATHLEN, (size_t *)&pathlen); + error = copyinstr(uap->fname, path, MAXPATHLEN, &pathlen); + if (!error) + error = copyin(uap->fhp, &fhlen, sizeof(fhlen)); if (error) return (error); + /* limit fh size to length specified (or v3 size by default) */ + if ((fhlen != NFSV2_MAX_FH_SIZE) && (fhlen != NFSV3_MAX_FH_SIZE)) + fhlen = NFSV3_MAX_FH_SIZE; + fidlen = fhlen - sizeof(struct nfs_exphandle); + + if (!nfsrv_is_initialized()) + return (EINVAL); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, - UIO_SYSSPACE, path, &context); + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + UIO_SYSSPACE, CAST_USER_ADDR_T(path), vfs_context_current()); error = namei(&nd); if (error) return (error); @@ -201,10 +536,10 @@ getfh(proc_t p, struct getfh_args *uap, __unused int *retval) vp = nd.ni_vp; // find exportfs that matches f_mntonname - lck_rw_lock_shared(&nfs_export_rwlock); + lck_rw_lock_shared(&nfsrv_export_rwlock); ptr = vnode_mount(vp)->mnt_vfsstat.f_mntonname; - LIST_FOREACH(nxfs, &nfs_exports, nxfs_next) { - if (!strcmp(nxfs->nxfs_path, ptr)) + 
LIST_FOREACH(nxfs, &nfsrv_exports, nxfs_next) { + if (!strncmp(nxfs->nxfs_path, ptr, MAXPATHLEN)) break; } if (!nxfs || strncmp(nxfs->nxfs_path, path, strlen(nxfs->nxfs_path))) { @@ -233,25 +568,24 @@ getfh(proc_t p, struct getfh_args *uap, __unused int *retval) nfh.nfh_xh.nxh_expid = htonl(nx->nx_id); nfh.nfh_xh.nxh_flags = 0; nfh.nfh_xh.nxh_reserved = 0; - nfh.nfh_len = NFS_MAX_FID_SIZE; - error = VFS_VPTOFH(vp, &nfh.nfh_len, &nfh.nfh_fid[0], NULL); - if (nfh.nfh_len > (int)NFS_MAX_FID_SIZE) + nfh.nfh_len = fidlen; + error = VFS_VPTOFH(vp, (int*)&nfh.nfh_len, &nfh.nfh_fid[0], NULL); + if (nfh.nfh_len > (uint32_t)fidlen) error = EOVERFLOW; nfh.nfh_xh.nxh_fidlen = nfh.nfh_len; nfh.nfh_len += sizeof(nfh.nfh_xh); + nfh.nfh_fhp = (u_char*)&nfh.nfh_xh; out: - lck_rw_done(&nfs_export_rwlock); + lck_rw_done(&nfsrv_export_rwlock); vnode_put(vp); if (error) return (error); - error = copyout((caddr_t)&nfh, uap->fhp, sizeof(nfh)); + error = copyout((caddr_t)&nfh, uap->fhp, sizeof(fhandle_t)); return (error); } -#endif /* NFS_NOSERVER */ - -extern struct fileops vnops; +extern const struct fileops vnops; /* * syscall for the rpc.lockd to use to translate a NFS file handle into @@ -263,7 +597,7 @@ extern struct fileops vnops; int fhopen( proc_t p, struct fhopen_args *uap, - register_t *retval) + int32_t *retval) { vnode_t vp; struct nfs_filehandle nfh; @@ -273,19 +607,20 @@ fhopen( proc_t p, struct fileproc *fp, *nfp; int fmode, error, type; int indx; - kauth_cred_t cred = proc_ucred(p); - struct vfs_context context; + vfs_context_t ctx = vfs_context_current(); kauth_action_t action; - context.vc_proc = p; - context.vc_ucred = cred; - /* * Must be super user */ - error = suser(cred, 0); - if (error) + error = suser(vfs_context_ucred(ctx), 0); + if (error) { return (error); + } + + if (!nfsrv_is_initialized()) { + return (EINVAL); + } fmode = FFLAGS(uap->flags); /* why not allow a non-read/write open for our lockd? 
*/ @@ -296,18 +631,22 @@ fhopen( proc_t p, if (error) return (error); if ((nfh.nfh_len < (int)sizeof(struct nfs_exphandle)) || - (nfh.nfh_len > (int)NFS_MAX_FH_SIZE)) + (nfh.nfh_len > (int)NFSV3_MAX_FH_SIZE)) return (EINVAL); error = copyin(uap->u_fhp, &nfh, sizeof(nfh.nfh_len) + nfh.nfh_len); if (error) return (error); + nfh.nfh_fhp = (u_char*)&nfh.nfh_xh; - lck_rw_lock_shared(&nfs_export_rwlock); + lck_rw_lock_shared(&nfsrv_export_rwlock); /* now give me my vnode, it gets returned to me with a reference */ - error = nfsrv_fhtovp(&nfh, NULL, TRUE, &vp, &nx, &nxo); - lck_rw_done(&nfs_export_rwlock); - if (error) + error = nfsrv_fhtovp(&nfh, NULL, &vp, &nx, &nxo); + lck_rw_done(&nfsrv_export_rwlock); + if (error) { + if (error == NFSERR_TRYLATER) + error = EAGAIN; // XXX EBUSY? Or just leave as TRYLATER? return (error); + } /* * From now on we have to make sure not @@ -336,12 +675,12 @@ fhopen( proc_t p, action |= KAUTH_VNODE_READ_DATA; if (fmode & (FWRITE | O_TRUNC)) action |= KAUTH_VNODE_WRITE_DATA; - if ((error = vnode_authorize(vp, NULL, action, &context)) != 0) + if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0) goto bad; - if ((error = VNOP_OPEN(vp, fmode, &context))) + if ((error = VNOP_OPEN(vp, fmode, ctx))) goto bad; - if ((error = vnode_ref_ext(vp, fmode))) + if ((error = vnode_ref_ext(vp, fmode, 0))) goto bad; /* @@ -349,14 +688,13 @@ fhopen( proc_t p, */ // starting here... 
error paths should call vn_close/vnode_put - if ((error = falloc(p, &nfp, &indx)) != 0) { - vn_close(vp, fmode & FMASK, cred, p); + if ((error = falloc(p, &nfp, &indx, ctx)) != 0) { + vn_close(vp, fmode & FMASK, ctx); goto bad; } fp = nfp; fp->f_fglob->fg_flag = fmode & FMASK; - fp->f_fglob->fg_type = DTYPE_VNODE; fp->f_fglob->fg_ops = &vnops; fp->f_fglob->fg_data = (caddr_t)vp; @@ -372,8 +710,12 @@ fhopen( proc_t p, type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; - if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, &context))) { - vn_close(vp, fp->f_fglob->fg_flag, fp->f_fglob->fg_cred, p); + if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL))) { + struct vfs_context context = *vfs_context_current(); + /* Modify local copy (to not damage thread copy) */ + context.vc_ucred = fp->f_fglob->fg_cred; + + vn_close(vp, fp->f_fglob->fg_flag, &context); fp_free(p, indx, fp); return (error); } @@ -383,7 +725,7 @@ fhopen( proc_t p, vnode_put(vp); proc_fdlock(p); - *fdflags(p, indx) &= ~UF_RESERVED; + procfdtbl_releasefd(p, indx, NULL); fp_drop(p, indx, fp, 1); proc_fdunlock(p); @@ -396,76 +738,33 @@ bad: } /* - * Nfs server psuedo system call for the nfsd's - * Based on the flag value it either: - * - adds a socket to the selection list - * - remains in the kernel as an nfsd - * - remains in the kernel as an nfsiod + * NFS server pseudo system call */ int nfssvc(proc_t p, struct nfssvc_args *uap, __unused int *retval) { -#ifndef NFS_NOSERVER - struct nameidata nd; mbuf_t nam; struct user_nfsd_args user_nfsdarg; - struct nfsd_srvargs nfsd_srvargs, *nsd = &nfsd_srvargs; - struct nfsd_cargs ncd; - struct nfsd *nfsd; - struct nfssvc_sock *slp; - struct nfsuid *nuidp; - struct nfsmount *nmp; - struct timeval now; socket_t so; - struct vfs_context context; - struct ucred temp_cred; -#endif /* NFS_NOSERVER */ int error; AUDIT_ARG(cmd, uap->flag); /* - * Must be super user + * Must be super user for most operations 
(export ops checked later). */ - error = proc_suser(p); - if(error) + if ((uap->flag != NFSSVC_EXPORT) && ((error = proc_suser(p)))) return (error); - if (uap->flag & NFSSVC_BIOD) - error = nfssvc_iod(p); -#ifdef NFS_NOSERVER - else - error = ENXIO; -#else /* !NFS_NOSERVER */ - else if (uap->flag & NFSSVC_MNTD) { - - context.vc_proc = p; - context.vc_ucred = kauth_cred_get(); - - error = copyin(uap->argp, (caddr_t)&ncd, sizeof (ncd)); - if (error) - return (error); - - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, - (proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), - CAST_USER_ADDR_T(ncd.ncd_dirp), &context); - error = namei(&nd); - if (error) - return (error); - nameidone(&nd); +#if CONFIG_MACF + error = mac_system_check_nfsd(kauth_cred_get()); + if (error) + return (error); +#endif - if (vnode_isvroot(nd.ni_vp) == 0) - error = EINVAL; - nmp = VFSTONFS(vnode_mount(nd.ni_vp)); - vnode_put(nd.ni_vp); - if (error) - return (error); + /* make sure NFS server data structures have been initialized */ + nfsrv_init(); - if ((nmp->nm_state & NFSSTA_MNTD) && - (uap->flag & NFSSVC_GOTAUTH) == 0) - return (0); - nmp->nm_state |= NFSSTA_MNTD; - error = nfskerb_clientd(nmp, &ncd, uap->flag, uap->argp, p); - } else if (uap->flag & NFSSVC_ADDSOCK) { + if (uap->flag & NFSSVC_ADDSOCK) { if (IS_64BIT_PROCESS(p)) { error = copyin(uap->argp, (caddr_t)&user_nfsdarg, sizeof(user_nfsdarg)); } else { @@ -499,334 +798,185 @@ nfssvc(proc_t p, struct nfssvc_args *uap, __unused int *retval) * to keep the socket from being closed when nfsd closes its * file descriptor for it. 
*/ - error = nfssvc_addsock(so, nam, p); + error = nfssvc_addsock(so, nam); /* drop the iocount file_socket() grabbed on the file descriptor */ file_drop(user_nfsdarg.sock); } else if (uap->flag & NFSSVC_NFSD) { - error = copyin(uap->argp, (caddr_t)nsd, sizeof (*nsd)); - if (error) - return (error); - - if ((uap->flag & NFSSVC_AUTHIN) && ((nfsd = nsd->nsd_nfsd)) && - (nfsd->nfsd_slp->ns_flag & SLP_VALID)) { - slp = nfsd->nfsd_slp; - - /* - * First check to see if another nfsd has already - * added this credential. - */ - for (nuidp = NUIDHASH(slp,nsd->nsd_cr.cr_uid)->lh_first; - nuidp != 0; nuidp = nuidp->nu_hash.le_next) { - if (kauth_cred_getuid(nuidp->nu_cr) == nsd->nsd_cr.cr_uid && - (!nfsd->nfsd_nd->nd_nam2 || - netaddr_match(NU_NETFAM(nuidp), - &nuidp->nu_haddr, nfsd->nfsd_nd->nd_nam2))) - break; - } - if (nuidp) { - nfsrv_setcred(nuidp->nu_cr,nfsd->nfsd_nd->nd_cr); - nfsd->nfsd_nd->nd_flag |= ND_KERBFULL; - } else { - /* - * Nope, so we will. - */ - if (slp->ns_numuids < nuidhash_max) { - slp->ns_numuids++; - nuidp = (struct nfsuid *) - _MALLOC_ZONE(sizeof (struct nfsuid), - M_NFSUID, M_WAITOK); - } else - nuidp = (struct nfsuid *)0; - if ((slp->ns_flag & SLP_VALID) == 0) { - if (nuidp) { - FREE_ZONE((caddr_t)nuidp, - sizeof (struct nfsuid), M_NFSUID); - slp->ns_numuids--; - } - } else { - if (nuidp == (struct nfsuid *)0) { - nuidp = slp->ns_uidlruhead.tqh_first; - if (!nuidp) - return (ENOMEM); - LIST_REMOVE(nuidp, nu_hash); - TAILQ_REMOVE(&slp->ns_uidlruhead, nuidp, - nu_lru); - if (nuidp->nu_flag & NU_NAM) - mbuf_freem(nuidp->nu_nam); - kauth_cred_rele(nuidp->nu_cr); - } - nuidp->nu_flag = 0; - - if (nsd->nsd_cr.cr_ngroups > NGROUPS) - nsd->nsd_cr.cr_ngroups = NGROUPS; - - nfsrv_setcred(&nsd->nsd_cr, &temp_cred); - nuidp->nu_cr = kauth_cred_create(&temp_cred); - - if (!nuidp->nu_cr) { - FREE_ZONE(nuidp, sizeof(struct nfsuid), M_NFSUID); - slp->ns_numuids--; - return (ENOMEM); - } - nuidp->nu_timestamp = nsd->nsd_timestamp; - microtime(&now); - 
nuidp->nu_expire = now.tv_sec + nsd->nsd_ttl; - /* - * and save the session key in nu_key. - */ - bcopy(nsd->nsd_key, nuidp->nu_key, - sizeof (nsd->nsd_key)); - if (nfsd->nfsd_nd->nd_nam2) { - struct sockaddr_in *saddr; - - saddr = mbuf_data(nfsd->nfsd_nd->nd_nam2); - switch (saddr->sin_family) { - case AF_INET: - nuidp->nu_flag |= NU_INETADDR; - nuidp->nu_inetaddr = - saddr->sin_addr.s_addr; - break; - case AF_ISO: - default: - nuidp->nu_flag |= NU_NAM; - error = mbuf_copym(nfsd->nfsd_nd->nd_nam2, 0, - MBUF_COPYALL, MBUF_WAITOK, - &nuidp->nu_nam); - if (error) { - kauth_cred_rele(nuidp->nu_cr); - FREE_ZONE(nuidp, sizeof(struct nfsuid), M_NFSUID); - slp->ns_numuids--; - return (error); - } - break; - }; - } - TAILQ_INSERT_TAIL(&slp->ns_uidlruhead, nuidp, - nu_lru); - LIST_INSERT_HEAD(NUIDHASH(slp, nsd->nsd_uid), - nuidp, nu_hash); - nfsrv_setcred(nuidp->nu_cr, - nfsd->nfsd_nd->nd_cr); - nfsd->nfsd_nd->nd_flag |= ND_KERBFULL; - } - } - } - if ((uap->flag & NFSSVC_AUTHINFAIL) && (nfsd = nsd->nsd_nfsd)) - nfsd->nfsd_flag |= NFSD_AUTHFAIL; - error = nfssvc_nfsd(nsd, uap->argp, p); + error = nfssvc_nfsd(); } else if (uap->flag & NFSSVC_EXPORT) { - error = nfssvc_export(uap->argp, p); + error = nfssvc_export(uap->argp); } else { error = EINVAL; } -#endif /* NFS_NOSERVER */ if (error == EINTR || error == ERESTART) error = 0; return (error); } -/* - * NFSKERB client helper daemon. - * Gets authorization strings for "kerb" mounts. - */ -static int -nfskerb_clientd( - struct nfsmount *nmp, - struct nfsd_cargs *ncd, - int flag, - user_addr_t argp, - proc_t p) -{ - struct nfsuid *nuidp, *nnuidp; - int error = 0; - struct nfsreq *rp; - struct timeval now; - - /* - * First initialize some variables - */ - microtime(&now); - - /* - * If an authorization string is being passed in, get it. 
- */ - if ((flag & NFSSVC_GOTAUTH) && (nmp->nm_state & NFSSTA_MOUNTED) && - ((nmp->nm_state & NFSSTA_WAITAUTH) == 0)) { - if (nmp->nm_state & NFSSTA_HASAUTH) - panic("cld kerb"); - if ((flag & NFSSVC_AUTHINFAIL) == 0) { - if (ncd->ncd_authlen <= nmp->nm_authlen && - ncd->ncd_verflen <= nmp->nm_verflen && - !copyin(CAST_USER_ADDR_T(ncd->ncd_authstr),nmp->nm_authstr,ncd->ncd_authlen)&& - !copyin(CAST_USER_ADDR_T(ncd->ncd_verfstr),nmp->nm_verfstr,ncd->ncd_verflen)){ - nmp->nm_authtype = ncd->ncd_authtype; - nmp->nm_authlen = ncd->ncd_authlen; - nmp->nm_verflen = ncd->ncd_verflen; -#if NFSKERB - nmp->nm_key = ncd->ncd_key; -#endif - } else - nmp->nm_state |= NFSSTA_AUTHERR; - } else - nmp->nm_state |= NFSSTA_AUTHERR; - nmp->nm_state |= NFSSTA_HASAUTH; - wakeup((caddr_t)&nmp->nm_authlen); - } else { - nmp->nm_state |= NFSSTA_WAITAUTH; - } - - /* - * Loop every second updating queue until there is a termination sig. - */ - while (nmp->nm_state & NFSSTA_MOUNTED) { - /* Get an authorization string, if required. */ - if ((nmp->nm_state & (NFSSTA_WAITAUTH | NFSSTA_HASAUTH)) == 0) { - ncd->ncd_authuid = nmp->nm_authuid; - if (copyout((caddr_t)ncd, argp, sizeof (struct nfsd_cargs))) - nmp->nm_state |= NFSSTA_WAITAUTH; - else - return (ENEEDAUTH); - } - /* Wait a bit (no pun) and do it again. */ - if ((nmp->nm_state & NFSSTA_MOUNTED) && - (nmp->nm_state & (NFSSTA_WAITAUTH | NFSSTA_HASAUTH))) { - error = tsleep((caddr_t)&nmp->nm_authstr, PSOCK | PCATCH, - "nfskrbtimr", hz / 3); - if (error == EINTR || error == ERESTART) - dounmount(nmp->nm_mountp, 0, NULL, p); - } - } - - /* - * Finally, we can free up the mount structure. 
- */ - for (nuidp = nmp->nm_uidlruhead.tqh_first; nuidp != 0; nuidp = nnuidp) { - nnuidp = nuidp->nu_lru.tqe_next; - LIST_REMOVE(nuidp, nu_hash); - TAILQ_REMOVE(&nmp->nm_uidlruhead, nuidp, nu_lru); - kauth_cred_rele(nuidp->nu_cr); - FREE_ZONE((caddr_t)nuidp, sizeof (struct nfsuid), M_NFSUID); - } - /* - * Loop through outstanding request list and remove dangling - * references to defunct nfsmount struct - */ - for (rp = nfs_reqq.tqh_first; rp; rp = rp->r_chain.tqe_next) - if (rp->r_nmp == nmp) - rp->r_nmp = (struct nfsmount *)0; - /* Need to wake up any rcvlock waiters so they notice the unmount. */ - if (nmp->nm_state & NFSSTA_WANTRCV) { - nmp->nm_state &= ~NFSSTA_WANTRCV; - wakeup(&nmp->nm_state); - } - FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); - if (error == EWOULDBLOCK) - error = 0; - return (error); -} - -#ifndef NFS_NOSERVER /* * Adds a socket to the list for servicing by nfsds. */ -static int -nfssvc_addsock( - socket_t so, - mbuf_t mynam, - __unused proc_t p) +int +nfssvc_addsock(socket_t so, mbuf_t mynam) { - int siz; - struct nfssvc_sock *slp; - struct nfssvc_sock *tslp = NULL; - int error, sodomain, sotype, soprotocol, on = 1; + struct nfsrv_sock *slp; + int error = 0, sodomain, sotype, soprotocol, on = 1; + int first; struct timeval timeo; /* make sure mbuf constants are set up */ - if (!nfs_mbuf_mlen) + if (!nfs_mbuf_mhlen) nfs_mbuf_init(); sock_gettype(so, &sodomain, &sotype, &soprotocol); - /* - * Add it to the list, as required. 
- */ - if (soprotocol == IPPROTO_UDP) { - tslp = nfs_udpsock; - if (!tslp || (tslp->ns_flag & SLP_VALID)) { - mbuf_freem(mynam); - return (EPERM); - } -#if ISO - } else if (soprotocol == ISOPROTO_CLTP) { - tslp = nfs_cltpsock; - if (!tslp || (tslp->ns_flag & SLP_VALID)) { - mbuf_freem(mynam); - return (EPERM); - } -#endif /* ISO */ + /* There should be only one UDP socket for each of IPv4 and IPv6 */ + if ((sodomain == AF_INET) && (soprotocol == IPPROTO_UDP) && nfsrv_udpsock) { + mbuf_freem(mynam); + return (EEXIST); } - /* reserve buffer space for 2 maximally-sized packets */ - siz = NFS_MAXPACKET; - if (sotype == SOCK_STREAM) - siz += sizeof (u_long); - siz *= 2; - if (siz > NFS_MAXSOCKBUF) - siz = NFS_MAXSOCKBUF; - if ((error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &siz, sizeof(siz))) || - (error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &siz, sizeof(siz)))) { + if ((sodomain == AF_INET6) && (soprotocol == IPPROTO_UDP) && nfsrv_udp6sock) { mbuf_freem(mynam); - return (error); + return (EEXIST); } - /* - * Set protocol specific options { for now TCP only } and - * reserve some space. For datagram sockets, this can get called - * repeatedly for the same socket, but that isn't harmful. - */ + /* Set protocol options and reserve some space (for UDP). 
*/ if (sotype == SOCK_STREAM) { + error = nfsrv_check_exports_allow_address(mynam); + if (error) + return (error); sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)); } - if (sodomain == AF_INET && soprotocol == IPPROTO_TCP) { + if ((sodomain == AF_INET) && (soprotocol == IPPROTO_TCP)) sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); + if (sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */ + int reserve = NFS_UDPSOCKBUF; + error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve)); + error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve)); + if (error) { + log(LOG_INFO, "nfssvc_addsock: UDP socket buffer setting error(s) %d\n", error); + error = 0; + } } - sock_nointerrupt(so, 0); + /* + * Set socket send/receive timeouts. + * Receive timeout shouldn't matter, but setting the send timeout + * will make sure that an unresponsive client can't hang the server. + */ timeo.tv_usec = 0; - timeo.tv_sec = 0; - error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); - error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); - - if (tslp) { - slp = tslp; - lck_mtx_lock(nfsd_mutex); - } else { - MALLOC(slp, struct nfssvc_sock *, sizeof(struct nfssvc_sock), - M_NFSSVC, M_WAITOK); - if (!slp) { - mbuf_freem(mynam); - return (ENOMEM); - } - bzero((caddr_t)slp, sizeof (struct nfssvc_sock)); - lck_rw_init(&slp->ns_rwlock, nfs_slp_rwlock_group, nfs_slp_lock_attr); - lck_mtx_init(&slp->ns_wgmutex, nfs_slp_mutex_group, nfs_slp_lock_attr); - TAILQ_INIT(&slp->ns_uidlruhead); - lck_mtx_lock(nfsd_mutex); - TAILQ_INSERT_TAIL(&nfssvc_sockhead, slp, ns_chain); + timeo.tv_sec = 1; + error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); + timeo.tv_sec = 30; + error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); + if (error) { + log(LOG_INFO, "nfssvc_addsock: socket timeout setting error(s) %d\n", error); + error = 0; } - 
sock_retain(so); /* grab a retain count on the socket */ - slp->ns_so = so; - slp->ns_sotype = sotype; - slp->ns_nam = mynam; + MALLOC(slp, struct nfsrv_sock *, sizeof(struct nfsrv_sock), M_NFSSVC, M_WAITOK); + if (!slp) { + mbuf_freem(mynam); + return (ENOMEM); + } + bzero((caddr_t)slp, sizeof (struct nfsrv_sock)); + lck_rw_init(&slp->ns_rwlock, nfsrv_slp_rwlock_group, LCK_ATTR_NULL); + lck_mtx_init(&slp->ns_wgmutex, nfsrv_slp_mutex_group, LCK_ATTR_NULL); - socket_lock(so, 1); - so->so_upcallarg = (caddr_t)slp; - so->so_upcall = nfsrv_rcv; - so->so_rcv.sb_flags |= SB_UPCALL; /* required for freebsd merge */ - socket_unlock(so, 1); + lck_mtx_lock(nfsd_mutex); - slp->ns_flag = SLP_VALID | SLP_NEEDQ; + if (soprotocol == IPPROTO_UDP) { + if (sodomain == AF_INET) { + /* There should be only one UDP/IPv4 socket */ + if (nfsrv_udpsock) { + lck_mtx_unlock(nfsd_mutex); + nfsrv_slpfree(slp); + mbuf_freem(mynam); + return (EEXIST); + } + nfsrv_udpsock = slp; + } + if (sodomain == AF_INET6) { + /* There should be only one UDP/IPv6 socket */ + if (nfsrv_udp6sock) { + lck_mtx_unlock(nfsd_mutex); + nfsrv_slpfree(slp); + mbuf_freem(mynam); + return (EEXIST); + } + nfsrv_udp6sock = slp; + } + } + + /* add the socket to the list */ + first = TAILQ_EMPTY(&nfsrv_socklist); + TAILQ_INSERT_TAIL(&nfsrv_socklist, slp, ns_chain); + if (soprotocol == IPPROTO_TCP) { + nfsrv_sock_tcp_cnt++; + if (nfsrv_sock_idle_timeout < 0) + nfsrv_sock_idle_timeout = 0; + if (nfsrv_sock_idle_timeout && (nfsrv_sock_idle_timeout < NFSD_MIN_IDLE_TIMEOUT)) + nfsrv_sock_idle_timeout = NFSD_MIN_IDLE_TIMEOUT; + /* + * Possibly start or stop the idle timer. We only start the idle timer when + * we have more than 2 * nfsd_thread_max connections. If the idle timer is + * on then we may need to turn it off based on the nvsrv_sock_idle_timeout or + * the number of connections. 
+ */ + if ((nfsrv_sock_tcp_cnt > 2 * nfsd_thread_max) || nfsrv_idlesock_timer_on) { + if (nfsrv_sock_idle_timeout == 0 || nfsrv_sock_tcp_cnt <= 2 * nfsd_thread_max) { + if (nfsrv_idlesock_timer_on) { + thread_call_cancel(nfsrv_idlesock_timer_call); + nfsrv_idlesock_timer_on = 0; + } + } else { + struct nfsrv_sock *old_slp; + struct timeval now; + time_t time_to_wait = nfsrv_sock_idle_timeout; + /* + * Get the oldest tcp socket and calculate the + * earliest time for the next idle timer to fire + * based on the possibly updated nfsrv_sock_idle_timeout + */ + TAILQ_FOREACH(old_slp, &nfsrv_socklist, ns_chain) { + if (old_slp->ns_sotype == SOCK_STREAM) { + microuptime(&now); + time_to_wait -= now.tv_sec - old_slp->ns_timestamp; + if (time_to_wait < 1) + time_to_wait = 1; + break; + } + } + /* + * If we have a timer scheduled, but if its going to fire too late, + * turn it off. + */ + if (nfsrv_idlesock_timer_on > now.tv_sec + time_to_wait) { + thread_call_cancel(nfsrv_idlesock_timer_call); + nfsrv_idlesock_timer_on = 0; + } + /* Schedule the idle thread if it isn't already */ + if (!nfsrv_idlesock_timer_on) { + nfs_interval_timer_start(nfsrv_idlesock_timer_call, time_to_wait * 1000); + nfsrv_idlesock_timer_on = now.tv_sec + time_to_wait; + } + } + } + } + + sock_retain(so); /* grab a retain count on the socket */ + slp->ns_so = so; + slp->ns_sotype = sotype; + slp->ns_nam = mynam; + + /* set up the socket up-call */ + nfsrv_uc_addsock(slp, first); + + /* mark that the socket is not in the nfsrv_sockwg list */ + slp->ns_wgq.tqe_next = SLPNOLIST; + + slp->ns_flag = SLP_VALID | SLP_NEEDQ; nfsrv_wakenfsd(slp); lck_mtx_unlock(nfsd_mutex); @@ -835,88 +985,158 @@ nfssvc_addsock( } /* - * Called by nfssvc() for nfsds. Just loops around servicing rpc requests - * until it is killed by a signal. + * nfssvc_nfsd() + * + * nfsd theory of operation: + * + * The first nfsd thread stays in user mode accepting new TCP connections + * which are then added via the "addsock" call. 
The rest of the nfsd threads + * simply call into the kernel and remain there in a loop handling NFS + * requests until killed by a signal. + * + * There's a list of nfsd threads (nfsd_head). + * There's an nfsd queue that contains only those nfsds that are + * waiting for work to do (nfsd_queue). + * + * There's a list of all NFS sockets (nfsrv_socklist) and three queues for + * managing the work on the sockets: + * nfsrv_sockwait - sockets w/new data waiting to be worked on + * nfsrv_sockwork - sockets being worked on which may have more work to do + * nfsrv_sockwg - sockets which have pending write gather data + * When a socket receives data, if it is not currently queued, it + * will be placed at the end of the "wait" queue. + * Whenever a socket needs servicing we make sure it is queued and + * wake up a waiting nfsd (if there is one). + * + * nfsds will service at most 8 requests from the same socket before + * defecting to work on another socket. + * nfsds will defect immediately if there are any sockets in the "wait" queue. + * nfsds looking for a socket to work on check the "wait" queue first and + * then check the "work" queue. + * When an nfsd starts working on a socket, it removes it from the head of + * the queue it's currently on and moves it to the end of the "work" queue. + * When nfsds are checking the queues for work, any sockets found not to + * have any work are simply dropped from the queue. 
+ * */ -static int -nfssvc_nfsd(nsd, argp, p) - struct nfsd_srvargs *nsd; - user_addr_t argp; - proc_t p; +int +nfssvc_nfsd(void) { - mbuf_t m, mreq; - struct nfssvc_sock *slp; - struct nfsd *nfsd = nsd->nsd_nfsd; + mbuf_t m, mrep; + struct nfsrv_sock *slp; + struct nfsd *nfsd; struct nfsrv_descript *nd = NULL; int error = 0, cacherep, writes_todo; - int siz, procrastinate; + int siz, procrastinate, opcnt = 0; u_quad_t cur_usec; struct timeval now; - boolean_t funnel_state; + struct vfs_context context; + struct timespec to; #ifndef nolint cacherep = RC_DOIT; writes_todo = 0; #endif - if (nfsd == (struct nfsd *)0) { - MALLOC(nfsd, struct nfsd *, sizeof(struct nfsd), M_NFSD, M_WAITOK); - if (!nfsd) - return (ENOMEM); - nsd->nsd_nfsd = nfsd; - bzero((caddr_t)nfsd, sizeof (struct nfsd)); - nfsd->nfsd_procp = p; - lck_mtx_lock(nfsd_mutex); - TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain); - nfs_numnfsd++; - lck_mtx_unlock(nfsd_mutex); - } - funnel_state = thread_funnel_set(kernel_flock, FALSE); + MALLOC(nfsd, struct nfsd *, sizeof(struct nfsd), M_NFSD, M_WAITOK); + if (!nfsd) + return (ENOMEM); + bzero(nfsd, sizeof(struct nfsd)); + lck_mtx_lock(nfsd_mutex); + if (nfsd_thread_count++ == 0) + nfsrv_initcache(); /* Init the server request cache */ + + TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain); + lck_mtx_unlock(nfsd_mutex); + + context.vc_thread = current_thread(); + + /* Set time out so that nfsd threads can wake up and see if they are still needed. */ + to.tv_sec = 5; + to.tv_nsec = 0; /* * Loop getting rpc requests until SIGKILL. 
*/ for (;;) { - if ((nfsd->nfsd_flag & NFSD_REQINPROG) == 0) { + if (nfsd_thread_max <= 0) { + /* NFS server shutting down, get out ASAP */ + error = EINTR; + slp = nfsd->nfsd_slp; + } else if (nfsd->nfsd_flag & NFSD_REQINPROG) { + /* already have some work to do */ + error = 0; + slp = nfsd->nfsd_slp; + } else { + /* need to find work to do */ + error = 0; lck_mtx_lock(nfsd_mutex); - while ((nfsd->nfsd_slp == NULL) && !(nfsd_head_flag & NFSD_CHECKSLP)) { + while (!nfsd->nfsd_slp && TAILQ_EMPTY(&nfsrv_sockwait) && TAILQ_EMPTY(&nfsrv_sockwork)) { + if (nfsd_thread_count > nfsd_thread_max) { + /* + * If we have no socket and there are more + * nfsd threads than configured, let's exit. + */ + error = 0; + goto done; + } nfsd->nfsd_flag |= NFSD_WAITING; - nfsd_waiting++; - error = msleep(nfsd, nfsd_mutex, PSOCK | PCATCH, "nfsd", 0); - nfsd_waiting--; + TAILQ_INSERT_HEAD(&nfsd_queue, nfsd, nfsd_queue); + error = msleep(nfsd, nfsd_mutex, PSOCK | PCATCH, "nfsd", &to); if (error) { - lck_mtx_unlock(nfsd_mutex); + if (nfsd->nfsd_flag & NFSD_WAITING) { + TAILQ_REMOVE(&nfsd_queue, nfsd, nfsd_queue); + nfsd->nfsd_flag &= ~NFSD_WAITING; + } + if (error == EWOULDBLOCK) + continue; goto done; } } - if ((nfsd->nfsd_slp == NULL) && (nfsd_head_flag & NFSD_CHECKSLP)) { - TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) { - lck_rw_lock_shared(&slp->ns_rwlock); - if ((slp->ns_flag & (SLP_VALID | SLP_DOREC)) - == (SLP_VALID | SLP_DOREC)) { - if (lck_rw_lock_shared_to_exclusive(&slp->ns_rwlock)) { - /* upgrade failed and we lost the lock; take exclusive and recheck */ - lck_rw_lock_exclusive(&slp->ns_rwlock); - if ((slp->ns_flag & (SLP_VALID | SLP_DOREC)) - != (SLP_VALID | SLP_DOREC)) { - /* flags no longer set, so skip this socket */ - lck_rw_done(&slp->ns_rwlock); - continue; - } - } - slp->ns_flag &= ~SLP_DOREC; - slp->ns_sref++; - nfsd->nfsd_slp = slp; - lck_rw_done(&slp->ns_rwlock); - break; - } - lck_rw_done(&slp->ns_rwlock); + slp = nfsd->nfsd_slp; + if (!slp && 
!TAILQ_EMPTY(&nfsrv_sockwait)) { + /* look for a socket to work on in the wait queue */ + while ((slp = TAILQ_FIRST(&nfsrv_sockwait))) { + lck_rw_lock_exclusive(&slp->ns_rwlock); + /* remove from the head of the queue */ + TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq); + slp->ns_flag &= ~SLP_WAITQ; + if ((slp->ns_flag & SLP_VALID) && (slp->ns_flag & SLP_WORKTODO)) + break; + /* nothing to do, so skip this socket */ + lck_rw_done(&slp->ns_rwlock); + } + } + if (!slp && !TAILQ_EMPTY(&nfsrv_sockwork)) { + /* look for a socket to work on in the work queue */ + while ((slp = TAILQ_FIRST(&nfsrv_sockwork))) { + lck_rw_lock_exclusive(&slp->ns_rwlock); + /* remove from the head of the queue */ + TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq); + slp->ns_flag &= ~SLP_WORKQ; + if ((slp->ns_flag & SLP_VALID) && (slp->ns_flag & SLP_WORKTODO)) + break; + /* nothing to do, so skip this socket */ + lck_rw_done(&slp->ns_rwlock); } - if (slp == 0) - nfsd_head_flag &= ~NFSD_CHECKSLP; + } + if (!nfsd->nfsd_slp && slp) { + /* we found a socket to work on, grab a reference */ + slp->ns_sref++; + microuptime(&now); + slp->ns_timestamp = now.tv_sec; + /* We keep the socket list in least recently used order for reaping idle sockets */ + TAILQ_REMOVE(&nfsrv_socklist, slp, ns_chain); + TAILQ_INSERT_TAIL(&nfsrv_socklist, slp, ns_chain); + nfsd->nfsd_slp = slp; + opcnt = 0; + /* and put it at the back of the work queue */ + TAILQ_INSERT_TAIL(&nfsrv_sockwork, slp, ns_svcq); + slp->ns_flag |= SLP_WORKQ; + lck_rw_done(&slp->ns_rwlock); } lck_mtx_unlock(nfsd_mutex); - if ((slp = nfsd->nfsd_slp) == NULL) + if (!slp) continue; lck_rw_lock_exclusive(&slp->ns_rwlock); if (slp->ns_flag & SLP_VALID) { @@ -927,35 +1147,44 @@ nfssvc_nfsd(nsd, argp, p) if (slp->ns_flag & SLP_DISCONN) nfsrv_zapsock(slp); error = nfsrv_dorec(slp, nfsd, &nd); - microuptime(&now); - cur_usec = (u_quad_t)now.tv_sec * 1000000 + - (u_quad_t)now.tv_usec; - if (error && slp->ns_wgtime && (slp->ns_wgtime <= cur_usec)) { - error = 0; - 
cacherep = RC_DOIT; - writes_todo = 1; - } else - writes_todo = 0; + if (error == EINVAL) { // RPCSEC_GSS drop + if (slp->ns_sotype == SOCK_STREAM) + nfsrv_zapsock(slp); // drop connection + } + writes_todo = 0; + if (error && (slp->ns_wgtime || (slp->ns_flag & SLP_DOWRITES))) { + microuptime(&now); + cur_usec = (u_quad_t)now.tv_sec * 1000000 + + (u_quad_t)now.tv_usec; + if (slp->ns_wgtime <= cur_usec) { + error = 0; + cacherep = RC_DOIT; + writes_todo = 1; + } + slp->ns_flag &= ~SLP_DOWRITES; + } nfsd->nfsd_flag |= NFSD_REQINPROG; } lck_rw_done(&slp->ns_rwlock); - } else { - error = 0; - slp = nfsd->nfsd_slp; } - if (error || (slp->ns_flag & SLP_VALID) == 0) { + if (error || (slp && !(slp->ns_flag & SLP_VALID))) { if (nd) { + nfsm_chain_cleanup(&nd->nd_nmreq); if (nd->nd_nam2) mbuf_freem(nd->nd_nam2); - if (nd->nd_cr) - kauth_cred_rele(nd->nd_cr); - FREE_ZONE((caddr_t)nd, - sizeof *nd, M_NFSRVDESC); + if (IS_VALID_CRED(nd->nd_cr)) + kauth_cred_unref(&nd->nd_cr); + if (nd->nd_gss_context) + nfs_gss_svc_ctx_deref(nd->nd_gss_context); + FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC); nd = NULL; } nfsd->nfsd_slp = NULL; nfsd->nfsd_flag &= ~NFSD_REQINPROG; - nfsrv_slpderef(slp); + if (slp) + nfsrv_slpderef(slp); + if (nfsd_thread_max <= 0) + break; continue; } if (nd) { @@ -965,86 +1194,89 @@ nfssvc_nfsd(nsd, argp, p) else nd->nd_nam = slp->ns_nam; - /* - * Check to see if authorization is needed. 
- */ - if (nfsd->nfsd_flag & NFSD_NEEDAUTH) { - nfsd->nfsd_flag &= ~NFSD_NEEDAUTH; - nsd->nsd_haddr = ((struct sockaddr_in *)mbuf_data(nd->nd_nam))->sin_addr.s_addr; - nsd->nsd_authlen = nfsd->nfsd_authlen; - nsd->nsd_verflen = nfsd->nfsd_verflen; - if (!copyout(nfsd->nfsd_authstr,CAST_USER_ADDR_T(nsd->nsd_authstr), - nfsd->nfsd_authlen) && - !copyout(nfsd->nfsd_verfstr, CAST_USER_ADDR_T(nsd->nsd_verfstr), - nfsd->nfsd_verflen) && - !copyout((caddr_t)nsd, argp, sizeof (*nsd))) { - thread_funnel_set(kernel_flock, funnel_state); - return (ENEEDAUTH); - } - cacherep = RC_DROPIT; - } else - cacherep = nfsrv_getcache(nd, slp, &mreq); - - if (nfsd->nfsd_flag & NFSD_AUTHFAIL) { - nfsd->nfsd_flag &= ~NFSD_AUTHFAIL; - nd->nd_procnum = NFSPROC_NOOP; - nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK); - cacherep = RC_DOIT; - } else if (nfs_privport) { - /* Check if source port is privileged */ - u_short port; - struct sockaddr *nam = mbuf_data(nd->nd_nam); - struct sockaddr_in *sin; - - sin = (struct sockaddr_in *)nam; - port = ntohs(sin->sin_port); - if (port >= IPPORT_RESERVED && - nd->nd_procnum != NFSPROC_NULL) { - char strbuf[MAX_IPv4_STR_LEN]; + cacherep = nfsrv_getcache(nd, slp, &mrep); + + if (nfsrv_require_resv_port) { + /* Check if source port is a reserved port */ + in_port_t port = 0; + struct sockaddr *saddr = mbuf_data(nd->nd_nam); + + if (saddr->sa_family == AF_INET) + port = ntohs(((struct sockaddr_in*)saddr)->sin_port); + else if (saddr->sa_family == AF_INET6) + port = ntohs(((struct sockaddr_in6*)saddr)->sin6_port); + if ((port >= IPPORT_RESERVED) && (nd->nd_procnum != NFSPROC_NULL)) { nd->nd_procnum = NFSPROC_NOOP; nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK); cacherep = RC_DOIT; - printf("NFS request from unprivileged port (%s:%d)\n", - inet_ntop(AF_INET, &sin->sin_addr, strbuf, sizeof(strbuf)), - port); } } } /* - * Loop to get all the write rpc relies that have been + * Loop to get all the write RPC replies that have been * gathered together. 
*/ do { switch (cacherep) { case RC_DOIT: - if (nd && (nd->nd_flag & ND_NFSV3)) - procrastinate = nfsrvw_procrastinate_v3; + if (nd && (nd->nd_vers == NFS_VER3)) + procrastinate = nfsrv_wg_delay_v3; else - procrastinate = nfsrvw_procrastinate; - lck_rw_lock_shared(&nfs_export_rwlock); + procrastinate = nfsrv_wg_delay; + lck_rw_lock_shared(&nfsrv_export_rwlock); + context.vc_ucred = NULL; if (writes_todo || ((nd->nd_procnum == NFSPROC_WRITE) && (procrastinate > 0))) - error = nfsrv_writegather(&nd, slp, nfsd->nfsd_procp, &mreq); + error = nfsrv_writegather(&nd, slp, &context, &mrep); else - error = (*(nfsrv3_procs[nd->nd_procnum]))(nd, slp, nfsd->nfsd_procp, &mreq); - lck_rw_done(&nfs_export_rwlock); - if (mreq == NULL) + error = (*(nfsrv_procs[nd->nd_procnum]))(nd, slp, &context, &mrep); + lck_rw_done(&nfsrv_export_rwlock); + if (mrep == NULL) { + /* + * If this is a stream socket and we are not going + * to send a reply we better close the connection + * so the client doesn't hang. + */ + if (error && slp->ns_sotype == SOCK_STREAM) { + lck_rw_lock_exclusive(&slp->ns_rwlock); + nfsrv_zapsock(slp); + lck_rw_done(&slp->ns_rwlock); + printf("NFS server: NULL reply from proc = %d error = %d\n", + nd->nd_procnum, error); + } break; + + } if (error) { - OSAddAtomic(1, (SInt32*)&nfsstats.srv_errs); - nfsrv_updatecache(nd, FALSE, mreq); + OSAddAtomic64(1, &nfsstats.srv_errs); + nfsrv_updatecache(nd, FALSE, mrep); if (nd->nd_nam2) { mbuf_freem(nd->nd_nam2); nd->nd_nam2 = NULL; } break; } - OSAddAtomic(1, (SInt32*)&nfsstats.srvrpccnt[nd->nd_procnum]); - nfsrv_updatecache(nd, TRUE, mreq); - nd->nd_mrep = NULL; + OSAddAtomic64(1, &nfsstats.srvrpccnt[nd->nd_procnum]); + nfsrv_updatecache(nd, TRUE, mrep); + /* FALLTHRU */ + case RC_REPLY: - m = mreq; + if (nd->nd_gss_mb != NULL) { // It's RPCSEC_GSS + /* + * Need to checksum or encrypt the reply + */ + error = nfs_gss_svc_protect_reply(nd, mrep); + if (error) { + mbuf_freem(mrep); + break; + } + } + + /* + * Get the total size 
of the reply + */ + m = mrep; siz = 0; while (m) { siz += mbuf_len(m); @@ -1054,7 +1286,7 @@ nfssvc_nfsd(nsd, argp, p) printf("mbuf siz=%d\n",siz); panic("Bad nfs svc reply"); } - m = mreq; + m = mrep; mbuf_pkthdr_setlen(m, siz); error = mbuf_pkthdr_setrcvif(m, NULL); if (error) @@ -1066,11 +1298,11 @@ nfssvc_nfsd(nsd, argp, p) if (slp->ns_sotype == SOCK_STREAM) { error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK); if (!error) - *(u_long*)mbuf_data(m) = htonl(0x80000000 | siz); + *(u_int32_t*)mbuf_data(m) = htonl(0x80000000 | siz); } if (!error) { if (slp->ns_flag & SLP_VALID) { - error = nfs_send(slp->ns_so, nd->nd_nam2, m, NULL); + error = nfsrv_send(slp, nd->nd_nam2, m); } else { error = EPIPE; mbuf_freem(m); @@ -1078,46 +1310,43 @@ nfssvc_nfsd(nsd, argp, p) } else { mbuf_freem(m); } - mreq = NULL; - if (nfsrtton) - nfsd_rt(slp->ns_sotype, nd, cacherep); + mrep = NULL; if (nd->nd_nam2) { mbuf_freem(nd->nd_nam2); nd->nd_nam2 = NULL; } - if (nd->nd_mrep) { - mbuf_freem(nd->nd_mrep); - nd->nd_mrep = NULL; - } if (error == EPIPE) { lck_rw_lock_exclusive(&slp->ns_rwlock); nfsrv_zapsock(slp); lck_rw_done(&slp->ns_rwlock); } if (error == EINTR || error == ERESTART) { - if (nd->nd_cr) - kauth_cred_rele(nd->nd_cr); - FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC); + nfsm_chain_cleanup(&nd->nd_nmreq); + if (IS_VALID_CRED(nd->nd_cr)) + kauth_cred_unref(&nd->nd_cr); + if (nd->nd_gss_context) + nfs_gss_svc_ctx_deref(nd->nd_gss_context); + FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC); nfsrv_slpderef(slp); + lck_mtx_lock(nfsd_mutex); goto done; } break; case RC_DROPIT: - if (nfsrtton) - nfsd_rt(slp->ns_sotype, nd, cacherep); - mbuf_freem(nd->nd_mrep); mbuf_freem(nd->nd_nam2); - nd->nd_mrep = nd->nd_nam2 = NULL; + nd->nd_nam2 = NULL; break; }; + opcnt++; if (nd) { - if (nd->nd_mrep) - mbuf_freem(nd->nd_mrep); + nfsm_chain_cleanup(&nd->nd_nmreq); if (nd->nd_nam2) mbuf_freem(nd->nd_nam2); - if (nd->nd_cr) - kauth_cred_rele(nd->nd_cr); - FREE_ZONE((caddr_t)nd, sizeof *nd, 
M_NFSRVDESC); + if (IS_VALID_CRED(nd->nd_cr)) + kauth_cred_unref(&nd->nd_cr); + if (nd->nd_gss_context) + nfs_gss_svc_ctx_deref(nd->nd_gss_context); + FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC); nd = NULL; } @@ -1125,48 +1354,53 @@ nfssvc_nfsd(nsd, argp, p) * Check to see if there are outstanding writes that * need to be serviced. */ - microuptime(&now); - cur_usec = (u_quad_t)now.tv_sec * 1000000 + - (u_quad_t)now.tv_usec; - if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec)) { - cacherep = RC_DOIT; - writes_todo = 1; - } else { - writes_todo = 0; + writes_todo = 0; + if (slp->ns_wgtime) { + microuptime(&now); + cur_usec = (u_quad_t)now.tv_sec * 1000000 + + (u_quad_t)now.tv_usec; + if (slp->ns_wgtime <= cur_usec) { + cacherep = RC_DOIT; + writes_todo = 1; + } } } while (writes_todo); - lck_rw_lock_exclusive(&slp->ns_rwlock); - if (nfsrv_dorec(slp, nfsd, &nd)) { + + nd = NULL; + if (TAILQ_EMPTY(&nfsrv_sockwait) && (opcnt < 8)) { + lck_rw_lock_exclusive(&slp->ns_rwlock); + error = nfsrv_dorec(slp, nfsd, &nd); + if (error == EINVAL) { // RPCSEC_GSS drop + if (slp->ns_sotype == SOCK_STREAM) + nfsrv_zapsock(slp); // drop connection + } lck_rw_done(&slp->ns_rwlock); + } + if (!nd) { + /* drop our reference on the socket */ nfsd->nfsd_flag &= ~NFSD_REQINPROG; nfsd->nfsd_slp = NULL; nfsrv_slpderef(slp); - } else { - lck_rw_done(&slp->ns_rwlock); } } -done: - thread_funnel_set(kernel_flock, funnel_state); lck_mtx_lock(nfsd_mutex); +done: TAILQ_REMOVE(&nfsd_head, nfsd, nfsd_chain); FREE(nfsd, M_NFSD); - nsd->nsd_nfsd = (struct nfsd *)0; - if (--nfs_numnfsd == 0) - nfsrv_init(TRUE); /* Reinitialize everything */ + if (--nfsd_thread_count == 0) + nfsrv_cleanup(); lck_mtx_unlock(nfsd_mutex); return (error); } -static int -nfssvc_export(user_addr_t argp, proc_t p) +int +nfssvc_export(user_addr_t argp) { int error = 0, is_64bit; struct user_nfs_export_args unxa; - struct vfs_context context; + vfs_context_t ctx = vfs_context_current(); - context.vc_proc = p; - context.vc_ucred = 
kauth_cred_get(); - is_64bit = IS_64BIT_PROCESS(p); + is_64bit = IS_64BIT_PROCESS(vfs_context_proc(ctx)); /* copy in pointers to path and export args */ if (is_64bit) { @@ -1188,204 +1422,20 @@ nfssvc_export(user_addr_t argp, proc_t p) if (error) return (error); - error = nfsrv_export(&unxa, &context); + error = nfsrv_export(&unxa, ctx); return (error); } -#endif /* NFS_NOSERVER */ - -int nfs_defect = 0; -/* XXX CSM 11/25/97 Upgrade sysctl.h someday */ -#ifdef notyet -SYSCTL_INT(_vfs_nfs, OID_AUTO, defect, CTLFLAG_RW, &nfs_defect, 0, ""); -#endif - -int -nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval) -{ - struct lockd_ans la; - int error; - - if (uap->flag == NFSCLNT_LOCKDWAIT) { - return (nfslockdwait(p)); - } - if (uap->flag == NFSCLNT_LOCKDANS) { - error = copyin(uap->argp, &la, sizeof(la)); - return (error != 0 ? error : nfslockdans(p, &la)); - } - if (uap->flag == NFSCLNT_LOCKDFD) - return (nfslockdfd(p, CAST_DOWN(int, uap->argp))); - return EINVAL; -} - - -static int nfssvc_iod_continue(int); - -/* - * Asynchronous I/O daemons for client nfs. - * They do read-ahead and write-behind operations on the block I/O cache. - * Never returns unless it fails or gets killed. - */ -static int -nfssvc_iod(__unused proc_t p) -{ - register int i, myiod; - struct uthread *ut; - - /* - * Assign my position or return error if too many already running - */ - myiod = -1; - for (i = 0; i < NFS_MAXASYNCDAEMON; i++) - if (nfs_asyncdaemon[i] == 0) { - nfs_asyncdaemon[i]++; - myiod = i; - break; - } - if (myiod == -1) - return (EBUSY); - nfs_numasync++; - - /* stuff myiod into uthread to get off local stack for continuation */ - - ut = (struct uthread *)get_bsdthread_info(current_thread()); - ut->uu_state.uu_nfs_myiod = myiod; /* squirrel away for continuation */ - - nfssvc_iod_continue(0); - /* NOTREACHED */ - return (0); -} - -/* - * Continuation for Asynchronous I/O daemons for client nfs. 
- */ -static int -nfssvc_iod_continue(int error) -{ - register struct nfsbuf *bp; - register int i, myiod; - struct nfsmount *nmp; - struct uthread *ut; - proc_t p; - - /* - * real myiod is stored in uthread, recover it - */ - ut = (struct uthread *)get_bsdthread_info(current_thread()); - myiod = ut->uu_state.uu_nfs_myiod; - p = current_proc(); // XXX - - /* - * Just loop around doin our stuff until SIGKILL - * - actually we don't loop with continuations... - */ - lck_mtx_lock(nfs_iod_mutex); - for (;;) { - while (((nmp = nfs_iodmount[myiod]) == NULL - || nmp->nm_bufq.tqh_first == NULL) - && error == 0 && nfs_ioddelwri == 0) { - if (nmp) - nmp->nm_bufqiods--; - nfs_iodwant[myiod] = p; // XXX this doesn't need to be a proc_t - nfs_iodmount[myiod] = NULL; - error = msleep0((caddr_t)&nfs_iodwant[myiod], nfs_iod_mutex, - PWAIT | PCATCH | PDROP, "nfsidl", 0, nfssvc_iod_continue); - lck_mtx_lock(nfs_iod_mutex); - } - if (error) { - nfs_asyncdaemon[myiod] = 0; - if (nmp) nmp->nm_bufqiods--; - nfs_iodwant[myiod] = NULL; - nfs_iodmount[myiod] = NULL; - lck_mtx_unlock(nfs_iod_mutex); - nfs_numasync--; - if (error == EINTR || error == ERESTART) - error = 0; - unix_syscall_return(error); - } - if (nmp != NULL) { - while ((bp = TAILQ_FIRST(&nmp->nm_bufq)) != NULL) { - /* Take one off the front of the list */ - TAILQ_REMOVE(&nmp->nm_bufq, bp, nb_free); - bp->nb_free.tqe_next = NFSNOLIST; - nmp->nm_bufqlen--; - if (nmp->nm_bufqwant && nmp->nm_bufqlen < 2 * nfs_numasync) { - nmp->nm_bufqwant = FALSE; - lck_mtx_unlock(nfs_iod_mutex); - wakeup(&nmp->nm_bufq); - } else { - lck_mtx_unlock(nfs_iod_mutex); - } - - SET(bp->nb_flags, NB_IOD); - if (ISSET(bp->nb_flags, NB_READ)) - nfs_doio(bp, bp->nb_rcred, NULL); - else - nfs_doio(bp, bp->nb_wcred, NULL); - - lck_mtx_lock(nfs_iod_mutex); - /* - * If there are more than one iod on this mount, then defect - * so that the iods can be shared out fairly between the mounts - */ - if (nfs_defect && nmp->nm_bufqiods > 1) { - nfs_iodmount[myiod] = 
NULL; - nmp->nm_bufqiods--; - break; - } - } - } - lck_mtx_unlock(nfs_iod_mutex); - - if (nfs_ioddelwri) { - i = 0; - nfs_ioddelwri = 0; - lck_mtx_lock(nfs_buf_mutex); - while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) { - struct nfsnode *np = VTONFS(bp->nb_vp); - nfs_buf_remfree(bp); - nfs_buf_refget(bp); - while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN); - nfs_buf_refrele(bp); - if (error) - break; - if (!bp->nb_vp) { - /* buffer is no longer valid */ - nfs_buf_drop(bp); - continue; - } - if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { - /* put buffer at end of delwri list */ - TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); - nfsbufdelwricnt++; - nfs_buf_drop(bp); - lck_mtx_unlock(nfs_buf_mutex); - nfs_flushcommits(np->n_vnode, NULL, 1); - } else { - SET(bp->nb_flags, (NB_ASYNC | NB_IOD)); - lck_mtx_unlock(nfs_buf_mutex); - nfs_buf_write(bp); - } - i++; - lck_mtx_lock(nfs_buf_mutex); - } - lck_mtx_unlock(nfs_buf_mutex); - } - - lck_mtx_lock(nfs_iod_mutex); - } -} - /* - * Shut down a socket associated with an nfssvc_sock structure. + * Shut down a socket associated with an nfsrv_sock structure. * Should be called with the send lock set, if required. * The trick here is to increment the sref at the start, so that the nfsds * will stop using it and clear ns_flag at the end so that it will not be * reassigned during cleanup. */ -static void -nfsrv_zapsock(struct nfssvc_sock *slp) +void +nfsrv_zapsock(struct nfsrv_sock *slp) { socket_t so; @@ -1397,248 +1447,21 @@ nfsrv_zapsock(struct nfssvc_sock *slp) if (so == NULL) return; - /* - * Attempt to deter future upcalls, but leave the - * upcall info in place to avoid a race with the - * networking code. - */ - socket_lock(so, 1); - so->so_rcv.sb_flags &= ~SB_UPCALL; - socket_unlock(so, 1); - + sock_setupcall(so, NULL, NULL); sock_shutdown(so, SHUT_RDWR); -} - -/* - * Get an authorization string for the uid by having the mount_nfs sitting - * on this mount point porpous out of the kernel and do it. 
- */ -int -nfs_getauth(nmp, rep, cred, auth_str, auth_len, verf_str, verf_len, key) - register struct nfsmount *nmp; - struct nfsreq *rep; - kauth_cred_t cred; - char **auth_str; - int *auth_len; - char *verf_str; - int *verf_len; - NFSKERBKEY_T key; /* return session key */ -{ - int error = 0; - - while ((nmp->nm_state & NFSSTA_WAITAUTH) == 0) { - nmp->nm_state |= NFSSTA_WANTAUTH; - (void) tsleep((caddr_t)&nmp->nm_authtype, PSOCK, - "nfsauth1", 2 * hz); - error = nfs_sigintr(nmp, rep, rep->r_procp); - if (error) { - nmp->nm_state &= ~NFSSTA_WANTAUTH; - return (error); - } - } - nmp->nm_state &= ~NFSSTA_WANTAUTH; - MALLOC(*auth_str, char *, RPCAUTH_MAXSIZ, M_TEMP, M_WAITOK); - if (!*auth_str) - return (ENOMEM); - nmp->nm_authstr = *auth_str; - nmp->nm_authlen = RPCAUTH_MAXSIZ; - nmp->nm_verfstr = verf_str; - nmp->nm_verflen = *verf_len; - nmp->nm_authuid = kauth_cred_getuid(cred); - nmp->nm_state &= ~NFSSTA_WAITAUTH; - wakeup((caddr_t)&nmp->nm_authstr); - - /* - * And wait for mount_nfs to do its stuff. - */ - while ((nmp->nm_state & NFSSTA_HASAUTH) == 0 && error == 0) { - (void) tsleep((caddr_t)&nmp->nm_authlen, PSOCK, - "nfsauth2", 2 * hz); - error = nfs_sigintr(nmp, rep, rep->r_procp); - } - if (nmp->nm_state & NFSSTA_AUTHERR) { - nmp->nm_state &= ~NFSSTA_AUTHERR; - error = EAUTH; - } - if (error) - FREE(*auth_str, M_TEMP); - else { - *auth_len = nmp->nm_authlen; - *verf_len = nmp->nm_verflen; - bcopy((caddr_t)nmp->nm_key, (caddr_t)key, sizeof (key)); - } - nmp->nm_state &= ~NFSSTA_HASAUTH; - nmp->nm_state |= NFSSTA_WAITAUTH; - if (nmp->nm_state & NFSSTA_WANTAUTH) { - nmp->nm_state &= ~NFSSTA_WANTAUTH; - wakeup((caddr_t)&nmp->nm_authtype); - } - return (error); -} - -/* - * Get a nickname authenticator and verifier. 
- */ -int -nfs_getnickauth( - struct nfsmount *nmp, - kauth_cred_t cred, - char **auth_str, - int *auth_len, - char *verf_str, - __unused int verf_len) -{ - register struct nfsuid *nuidp; - register u_long *nickp, *verfp; - struct timeval ktvin, ktvout, now; - -#if DIAGNOSTIC - if (verf_len < (4 * NFSX_UNSIGNED)) - panic("nfs_getnickauth verf too small"); -#endif - for (nuidp = NMUIDHASH(nmp, kauth_cred_getuid(cred))->lh_first; - nuidp != 0; nuidp = nuidp->nu_hash.le_next) { - if (kauth_cred_getuid(nuidp->nu_cr) == kauth_cred_getuid(cred)) - break; - } - microtime(&now); - if (!nuidp || nuidp->nu_expire < now.tv_sec) - return (EACCES); - - MALLOC(nickp, u_long *, 2 * NFSX_UNSIGNED, M_TEMP, M_WAITOK); - if (!nickp) - return (ENOMEM); - - /* - * Move to the end of the lru list (end of lru == most recently used). - */ - TAILQ_REMOVE(&nmp->nm_uidlruhead, nuidp, nu_lru); - TAILQ_INSERT_TAIL(&nmp->nm_uidlruhead, nuidp, nu_lru); - - *nickp++ = txdr_unsigned(RPCAKN_NICKNAME); - *nickp = txdr_unsigned(nuidp->nu_nickname); - *auth_str = (char *)nickp; - *auth_len = 2 * NFSX_UNSIGNED; - - /* - * Now we must encrypt the verifier and package it up. - */ - verfp = (u_long *)verf_str; - *verfp++ = txdr_unsigned(RPCAKN_NICKNAME); - microtime(&now); - if (now.tv_sec > nuidp->nu_timestamp.tv_sec || - (now.tv_sec == nuidp->nu_timestamp.tv_sec && - now.tv_usec > nuidp->nu_timestamp.tv_usec)) - nuidp->nu_timestamp = now; - else - nuidp->nu_timestamp.tv_usec++; - ktvin.tv_sec = txdr_unsigned(nuidp->nu_timestamp.tv_sec); - ktvin.tv_usec = txdr_unsigned(nuidp->nu_timestamp.tv_usec); /* - * Now encrypt the timestamp verifier in ecb mode using the session - * key. + * Remove from the up-call queue */ -#if NFSKERB - XXX -#endif - - *verfp++ = ktvout.tv_sec; - *verfp++ = ktvout.tv_usec; - *verfp = 0; - return (0); + nfsrv_uc_dequeue(slp); } -/* - * Save the current nickname in a hash list entry on the mount point. 
- */ -int -nfs_savenickauth(nmp, cred, len, key, mdp, dposp, mrep) - register struct nfsmount *nmp; - kauth_cred_t cred; - int len; - NFSKERBKEY_T key; - mbuf_t *mdp; - char **dposp; - mbuf_t mrep; -{ - register struct nfsuid *nuidp; - register u_long *tl; - register long t1; - mbuf_t md = *mdp; - struct timeval ktvin, ktvout, now; - u_long nick; - char *dpos = *dposp, *cp2; - int deltasec, error = 0; - - if (len == (3 * NFSX_UNSIGNED)) { - nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED); - ktvin.tv_sec = *tl++; - ktvin.tv_usec = *tl++; - nick = fxdr_unsigned(u_long, *tl); - - /* - * Decrypt the timestamp in ecb mode. - */ -#if NFSKERB - XXX -#endif - ktvout.tv_sec = fxdr_unsigned(long, ktvout.tv_sec); - ktvout.tv_usec = fxdr_unsigned(long, ktvout.tv_usec); - microtime(&now); - deltasec = now.tv_sec - ktvout.tv_sec; - if (deltasec < 0) - deltasec = -deltasec; - /* - * If ok, add it to the hash list for the mount point. - */ - if (deltasec <= NFS_KERBCLOCKSKEW) { - if (nmp->nm_numuids < nuidhash_max) { - nmp->nm_numuids++; - MALLOC_ZONE(nuidp, struct nfsuid *, - sizeof (struct nfsuid), - M_NFSUID, M_WAITOK); - } else { - nuidp = NULL; - } - if (!nuidp) { - nuidp = nmp->nm_uidlruhead.tqh_first; - if (!nuidp) { - error = ENOMEM; - goto nfsmout; - } - LIST_REMOVE(nuidp, nu_hash); - TAILQ_REMOVE(&nmp->nm_uidlruhead, nuidp, nu_lru); - kauth_cred_rele(nuidp->nu_cr); - } - nuidp->nu_flag = 0; - kauth_cred_ref(cred); - nuidp->nu_cr = cred; - nuidp->nu_expire = now.tv_sec + NFS_KERBTTL; - nuidp->nu_timestamp = ktvout; - nuidp->nu_nickname = nick; - bcopy(key, nuidp->nu_key, sizeof (key)); - TAILQ_INSERT_TAIL(&nmp->nm_uidlruhead, nuidp, nu_lru); - LIST_INSERT_HEAD(NMUIDHASH(nmp, kauth_cred_getuid(cred)), - nuidp, nu_hash); - } - } else - nfsm_adv(nfsm_rndup(len)); -nfsmout: - *mdp = md; - *dposp = dpos; - return (error); -} - -#ifndef NFS_NOSERVER - /* * cleanup and release a server socket structure. 
*/ void -nfsrv_slpfree(struct nfssvc_sock *slp) +nfsrv_slpfree(struct nfsrv_sock *slp) { - struct nfsuid *nuidp, *nnuidp; struct nfsrv_descript *nwp, *nnwp; if (slp->ns_so) { @@ -1651,31 +1474,29 @@ nfsrv_slpfree(struct nfssvc_sock *slp) mbuf_freem(slp->ns_raw); if (slp->ns_rec) mbuf_freem(slp->ns_rec); - slp->ns_nam = slp->ns_raw = slp->ns_rec = NULL; - - for (nuidp = slp->ns_uidlruhead.tqh_first; nuidp != 0; - nuidp = nnuidp) { - nnuidp = nuidp->nu_lru.tqe_next; - LIST_REMOVE(nuidp, nu_hash); - TAILQ_REMOVE(&slp->ns_uidlruhead, nuidp, nu_lru); - if (nuidp->nu_flag & NU_NAM) - mbuf_freem(nuidp->nu_nam); - kauth_cred_rele(nuidp->nu_cr); - FREE_ZONE((caddr_t)nuidp, - sizeof (struct nfsuid), M_NFSUID); - } + if (slp->ns_frag) + mbuf_freem(slp->ns_frag); + slp->ns_nam = slp->ns_raw = slp->ns_rec = slp->ns_frag = NULL; + slp->ns_reccnt = 0; for (nwp = slp->ns_tq.lh_first; nwp; nwp = nnwp) { nnwp = nwp->nd_tq.le_next; LIST_REMOVE(nwp, nd_tq); - if (nwp->nd_cr) - kauth_cred_rele(nwp->nd_cr); - FREE_ZONE((caddr_t)nwp, sizeof *nwp, M_NFSRVDESC); + nfsm_chain_cleanup(&nwp->nd_nmreq); + if (nwp->nd_mrep) + mbuf_freem(nwp->nd_mrep); + if (nwp->nd_nam2) + mbuf_freem(nwp->nd_nam2); + if (IS_VALID_CRED(nwp->nd_cr)) + kauth_cred_unref(&nwp->nd_cr); + if (nwp->nd_gss_context) + nfs_gss_svc_ctx_deref(nwp->nd_gss_context); + FREE_ZONE(nwp, sizeof(*nwp), M_NFSRVDESC); } LIST_INIT(&slp->ns_tq); - lck_rw_destroy(&slp->ns_rwlock, nfs_slp_rwlock_group); - lck_mtx_destroy(&slp->ns_wgmutex, nfs_slp_mutex_group); + lck_rw_destroy(&slp->ns_rwlock, nfsrv_slp_rwlock_group); + lck_mtx_destroy(&slp->ns_wgmutex, nfsrv_slp_mutex_group); FREE(slp, M_NFSSVC); } @@ -1683,143 +1504,178 @@ nfsrv_slpfree(struct nfssvc_sock *slp) * Derefence a server socket structure. If it has no more references and * is no longer valid, you can throw it away. 
*/ -void -nfsrv_slpderef(struct nfssvc_sock *slp) +static void +nfsrv_slpderef_locked(struct nfsrv_sock *slp) { - struct timeval now; - - lck_mtx_lock(nfsd_mutex); lck_rw_lock_exclusive(&slp->ns_rwlock); slp->ns_sref--; + if (slp->ns_sref || (slp->ns_flag & SLP_VALID)) { + if ((slp->ns_flag & SLP_QUEUED) && !(slp->ns_flag & SLP_WORKTODO)) { + /* remove socket from queue since there's no work */ + if (slp->ns_flag & SLP_WAITQ) + TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq); + else + TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq); + slp->ns_flag &= ~SLP_QUEUED; + } lck_rw_done(&slp->ns_rwlock); - lck_mtx_unlock(nfsd_mutex); return; } - /* queue the socket up for deletion */ - microuptime(&now); - slp->ns_timestamp = now.tv_sec; - TAILQ_REMOVE(&nfssvc_sockhead, slp, ns_chain); - TAILQ_INSERT_TAIL(&nfssvc_deadsockhead, slp, ns_chain); + /* This socket is no longer valid, so we'll get rid of it */ + + if (slp->ns_flag & SLP_QUEUED) { + if (slp->ns_flag & SLP_WAITQ) + TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq); + else + TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq); + slp->ns_flag &= ~SLP_QUEUED; + } lck_rw_done(&slp->ns_rwlock); - if (slp == nfs_udpsock) - nfs_udpsock = NULL; -#if ISO - else if (slp == nfs_cltpsock) - nfs_cltpsock = NULL; -#endif - lck_mtx_unlock(nfsd_mutex); + + TAILQ_REMOVE(&nfsrv_socklist, slp, ns_chain); + if (slp->ns_sotype == SOCK_STREAM) + nfsrv_sock_tcp_cnt--; + + /* now remove from the write gather socket list */ + if (slp->ns_wgq.tqe_next != SLPNOLIST) { + TAILQ_REMOVE(&nfsrv_sockwg, slp, ns_wgq); + slp->ns_wgq.tqe_next = SLPNOLIST; + } + nfsrv_slpfree(slp); } +void +nfsrv_slpderef(struct nfsrv_sock *slp) +{ + lck_mtx_lock(nfsd_mutex); + nfsrv_slpderef_locked(slp); + lck_mtx_unlock(nfsd_mutex); +} /* - * Initialize the data structures for the server. - * Handshake with any new nfsds starting up to avoid any chance of - * corruption. + * Check periodically for idle sockest if needed and + * zap them. 
*/ void -nfsrv_init(terminating) - int terminating; +nfsrv_idlesock_timer(__unused void *param0, __unused void *param1) { - struct nfssvc_sock *slp, *nslp; + struct nfsrv_sock *slp, *tslp; struct timeval now; + time_t time_to_wait = nfsrv_sock_idle_timeout; - if (terminating) { - microuptime(&now); - for (slp = TAILQ_FIRST(&nfssvc_sockhead); slp != 0; slp = nslp) { - nslp = TAILQ_NEXT(slp, ns_chain); - if (slp->ns_flag & SLP_VALID) { - lck_rw_lock_exclusive(&slp->ns_rwlock); - nfsrv_zapsock(slp); - lck_rw_done(&slp->ns_rwlock); - } - /* queue the socket up for deletion */ - slp->ns_timestamp = now.tv_sec; - TAILQ_REMOVE(&nfssvc_sockhead, slp, ns_chain); - TAILQ_INSERT_TAIL(&nfssvc_deadsockhead, slp, ns_chain); - if (slp == nfs_udpsock) - nfs_udpsock = NULL; -#if ISO - else if (slp == nfs_cltpsock) - nfs_cltpsock = NULL; -#endif - } - nfsrv_cleancache(); /* And clear out server cache */ -/* XXX Revisit when enabling WebNFS */ -#ifdef WEBNFS_ENABLED - } else - nfs_pub.np_valid = 0; -#else - } -#endif + microuptime(&now); + lck_mtx_lock(nfsd_mutex); - if (!terminating) { - TAILQ_INIT(&nfssvc_sockhead); - TAILQ_INIT(&nfssvc_deadsockhead); - TAILQ_INIT(&nfsd_head); - nfsd_head_flag &= ~NFSD_CHECKSLP; + /* Turn off the timer if we're suppose to and get out */ + if (nfsrv_sock_idle_timeout < NFSD_MIN_IDLE_TIMEOUT) + nfsrv_sock_idle_timeout = 0; + if ((nfsrv_sock_tcp_cnt <= 2 * nfsd_thread_max) || (nfsrv_sock_idle_timeout == 0)) { + nfsrv_idlesock_timer_on = 0; + lck_mtx_unlock(nfsd_mutex); + return; } - MALLOC(nfs_udpsock, struct nfssvc_sock *, sizeof(struct nfssvc_sock), - M_NFSSVC, M_WAITOK); - if (nfs_udpsock) { - bzero((caddr_t)nfs_udpsock, sizeof (struct nfssvc_sock)); - lck_rw_init(&nfs_udpsock->ns_rwlock, nfs_slp_rwlock_group, nfs_slp_lock_attr); - lck_mtx_init(&nfs_udpsock->ns_wgmutex, nfs_slp_mutex_group, nfs_slp_lock_attr); - TAILQ_INIT(&nfs_udpsock->ns_uidlruhead); - TAILQ_INSERT_HEAD(&nfssvc_sockhead, nfs_udpsock, ns_chain); - } else { - printf("nfsrv_init() 
failed to allocate UDP socket\n"); + TAILQ_FOREACH_SAFE(slp, &nfsrv_socklist, ns_chain, tslp) { + lck_rw_lock_exclusive(&slp->ns_rwlock); + /* Skip udp and referenced sockets */ + if (slp->ns_sotype == SOCK_DGRAM || slp->ns_sref) { + lck_rw_done(&slp->ns_rwlock); + continue; + } + /* + * If this is the first non-referenced socket that hasn't idle out, + * use its time stamp to calculate the earlist time in the future + * to start the next invocation of the timer. Since the nfsrv_socklist + * is sorted oldest access to newest. Once we find the first one, + * we're done and break out of the loop. + */ + if (((slp->ns_timestamp + nfsrv_sock_idle_timeout) > now.tv_sec) || + nfsrv_sock_tcp_cnt <= 2 * nfsd_thread_max) { + time_to_wait -= now.tv_sec - slp->ns_timestamp; + if (time_to_wait < 1) + time_to_wait = 1; + lck_rw_done(&slp->ns_rwlock); + break; + } + /* + * Bump the ref count. nfsrv_slpderef below will destroy + * the socket, since nfsrv_zapsock has closed it. + */ + slp->ns_sref++; + nfsrv_zapsock(slp); + lck_rw_done(&slp->ns_rwlock); + nfsrv_slpderef_locked(slp); } -#if ISO - MALLOC(nfs_cltpsock, struct nfssvc_sock *, sizeof(struct nfssvc_sock), - M_NFSSVC, M_WAITOK); - if (nfs_cltpsock) { - bzero((caddr_t)nfs_cltpsock, sizeof (struct nfssvc_sock)); - lck_rw_init(&nfs_cltpsock->ns_rwlock, nfs_slp_rwlock_group, nfs_slp_lock_attr); - lck_mtx_init(&nfs_cltpsock->ns_wgmutex, nfs_slp_mutex_group, nfs_slp_lock_attr); - TAILQ_INIT(&nfs_cltpsock->ns_uidlruhead); - TAILQ_INSERT_TAIL(&nfssvc_sockhead, nfs_cltpsock, ns_chain); - } else { - printf("nfsrv_init() failed to allocate CLTP socket\n"); - } -#endif + /* Start ourself back up */ + nfs_interval_timer_start(nfsrv_idlesock_timer_call, time_to_wait * 1000); + /* Remember when the next timer will fire for nfssvc_addsock. */ + nfsrv_idlesock_timer_on = now.tv_sec + time_to_wait; + lck_mtx_unlock(nfsd_mutex); } /* - * Add entries to the server monitor log. + * Clean up the data structures for the server. 
*/ -static void -nfsd_rt(sotype, nd, cacherep) - int sotype; - register struct nfsrv_descript *nd; - int cacherep; +void +nfsrv_cleanup(void) { - register struct drt *rt; + struct nfsrv_sock *slp, *nslp; struct timeval now; +#if CONFIG_FSE + struct nfsrv_fmod *fp, *nfp; + int i; +#endif - rt = &nfsdrt.drt[nfsdrt.pos]; - if (cacherep == RC_DOIT) - rt->flag = 0; - else if (cacherep == RC_REPLY) - rt->flag = DRT_CACHEREPLY; - else - rt->flag = DRT_CACHEDROP; - if (sotype == SOCK_STREAM) - rt->flag |= DRT_TCP; - else if (nd->nd_flag & ND_NFSV3) - rt->flag |= DRT_NFSV3; - rt->proc = nd->nd_procnum; - if (((struct sockaddr *)mbuf_data(nd->nd_nam))->sa_family == AF_INET) - rt->ipadr = ((struct sockaddr_in *)mbuf_data(nd->nd_nam))->sin_addr.s_addr; - else - rt->ipadr = INADDR_ANY; microuptime(&now); - rt->resptime = ((now.tv_sec - nd->nd_starttime.tv_sec) * 1000000) + - (now.tv_usec - nd->nd_starttime.tv_usec); - microtime(&rt->tstamp); // XXX unused - nfsdrt.pos = (nfsdrt.pos + 1) % NFSRTTLOGSIZ; + for (slp = TAILQ_FIRST(&nfsrv_socklist); slp != 0; slp = nslp) { + nslp = TAILQ_NEXT(slp, ns_chain); + lck_rw_lock_exclusive(&slp->ns_rwlock); + slp->ns_sref++; + if (slp->ns_flag & SLP_VALID) + nfsrv_zapsock(slp); + lck_rw_done(&slp->ns_rwlock); + nfsrv_slpderef_locked(slp); + } +# +#if CONFIG_FSE + /* + * Flush pending file write fsevents + */ + lck_mtx_lock(nfsrv_fmod_mutex); + for (i = 0; i < NFSRVFMODHASHSZ; i++) { + for (fp = LIST_FIRST(&nfsrv_fmod_hashtbl[i]); fp; fp = nfp) { + /* + * Fire off the content modified fsevent for each + * entry, remove it from the list, and free it. 
+ */ + if (nfsrv_fsevents_enabled) { + fp->fm_context.vc_thread = current_thread(); + add_fsevent(FSE_CONTENT_MODIFIED, &fp->fm_context, + FSE_ARG_VNODE, fp->fm_vp, + FSE_ARG_DONE); + } + vnode_put(fp->fm_vp); + kauth_cred_unref(&fp->fm_context.vc_ucred); + nfp = LIST_NEXT(fp, fm_link); + LIST_REMOVE(fp, fm_link); + FREE(fp, M_TEMP); + } + } + nfsrv_fmod_pending = 0; + lck_mtx_unlock(nfsrv_fmod_mutex); +#endif + + nfsrv_uc_cleanup(); /* Stop nfs socket up-call threads */ + + nfs_gss_svc_cleanup(); /* Remove any RPCSEC_GSS contexts */ + + nfsrv_cleancache(); /* And clear out server cache */ + + nfsrv_udpsock = NULL; + nfsrv_udp6sock = NULL; } + #endif /* NFS_NOSERVER */