X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/c0fea4742e91338fffdcf79f86a7c1d5e2b97eb1..eee3565979933af707c711411001ba11fe406a3c:/bsd/nfs/nfsnode.h diff --git a/bsd/nfs/nfsnode.h b/bsd/nfs/nfsnode.h index ea95a3a07..de4913f33 100644 --- a/bsd/nfs/nfsnode.h +++ b/bsd/nfs/nfsnode.h @@ -1,23 +1,29 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* @@ -69,32 +75,17 @@ #ifndef _NFS_NFS_H_ #include #endif +#include /* * Silly rename structure that hangs off the nfsnode until the name - * can be removed by nfs_inactive() + * can be removed by nfs_vnop_inactive() */ -struct sillyrename { - struct ucred *s_cred; - vnode_t s_dvp; - long s_namlen; - char s_name[20]; -}; - -/* - * This structure is used to save the logical directory offset to - * NFS cookie mappings. - * The mappings are stored in a list headed - * by n_cookies, as required. - * There is one mapping for each NFS_DIRBLKSIZ bytes of directory information - * stored in increasing logical offset byte order. - */ -#define NFSNUMCOOKIES 31 - -struct nfsdmap { - LIST_ENTRY(nfsdmap) ndm_list; - int ndm_eocookie; - nfsuint64 ndm_cookies[NFSNUMCOOKIES]; +struct nfs_sillyrename { + kauth_cred_t nsr_cred; + struct nfsnode *nsr_dnp; + int nsr_namlen; + char nsr_name[20]; }; /* @@ -102,13 +93,15 @@ struct nfsdmap { */ struct nfsbuf { LIST_ENTRY(nfsbuf) nb_hash; /* hash chain */ - LIST_ENTRY(nfsbuf) nb_vnbufs; /* vnode's nfsbuf chain */ + LIST_ENTRY(nfsbuf) nb_vnbufs; /* nfsnode's nfsbuf chain */ TAILQ_ENTRY(nfsbuf) nb_free; /* free list position if not active. 
*/ - volatile long nb_flags; /* NB_* flags. */ - volatile long nb_lflags; /* NBL_* flags. */ - volatile long nb_refs; /* outstanding references. */ - long nb_bufsize; /* buffer size */ + volatile uint32_t nb_flags; /* NB_* flags. */ + volatile uint32_t nb_lflags; /* NBL_* flags. */ + volatile uint32_t nb_refs; /* outstanding references. */ + uint32_t nb_bufsize; /* buffer size */ daddr64_t nb_lblkno; /* logical block number. */ + uint64_t nb_verf; /* V3 write verifier */ + int nb_commitlevel; /* lowest write commit level */ time_t nb_timestamp; /* buffer timestamp */ int nb_error; /* errno value. */ u_int32_t nb_valid; /* valid pages in buf */ @@ -117,18 +110,23 @@ struct nfsbuf { int nb_validend; /* offset of end of valid region. */ int nb_dirtyoff; /* offset in buffer of dirty region. */ int nb_dirtyend; /* offset of end of dirty region. */ + int nb_offio; /* offset in buffer of I/O region. */ + int nb_endio; /* offset of end of I/O region. */ + int nb_rpcs; /* Count of RPCs remaining for this buffer. */ caddr_t nb_data; /* mapped buffer */ - vnode_t nb_vp; /* device vnode */ - proc_t nb_proc; /* associated proc; NULL if kernel. */ - struct ucred * nb_rcred; /* read credentials reference */ - struct ucred * nb_wcred; /* write credentials reference */ + nfsnode_t nb_np; /* nfsnode buffer belongs to */ + kauth_cred_t nb_rcred; /* read credentials reference */ + kauth_cred_t nb_wcred; /* write credentials reference */ void * nb_pagelist; /* upl */ }; #define NFS_MAXBSIZE (32 * PAGE_SIZE) /* valid/dirty page masks limit buffer size */ +#define NFS_A_LOT_OF_NEEDCOMMITS 256 /* max# uncommitted buffers for a node */ +#define NFS_A_LOT_OF_DELAYED_WRITES MAX(nfsbufcnt/8,512) /* max# "delwri" buffers in system */ + /* - * These flags are kept in b_lflags... + * These flags are kept in nb_lflags... * nfs_buf_mutex must be held before examining/updating */ #define NBL_BUSY 0x00000001 /* I/O in progress. 
*/ @@ -139,23 +137,24 @@ struct nfsbuf { * very similar to the B_* flags for struct buf. * nfs_buf_mutex is not needed to examine/update these. */ -#define NB_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ +#define NB_STALEWVERF 0x00000001 /* write verifier changed on us */ +#define NB_NEEDCOMMIT 0x00000002 /* buffer needs to be committed */ #define NB_ASYNC 0x00000004 /* Start I/O, do not wait. */ -#define NB_CACHE 0x00000020 /* Bread found us in the cache. */ -#define NB_STABLE 0x00000040 /* write FILESYNC not UNSTABLE. */ -#define NB_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ +#define NB_CACHE 0x00000020 /* buffer data found in the cache */ +#define NB_STABLE 0x00000040 /* write FILESYNC not UNSTABLE */ +#define NB_DELWRI 0x00000080 /* delayed write: dirty range needs to be written */ #define NB_DONE 0x00000200 /* I/O completed. */ #define NB_EINTR 0x00000400 /* I/O was interrupted */ #define NB_ERROR 0x00000800 /* I/O error occurred. */ -#define NB_WASDIRTY 0x00001000 /* page was found dirty in the VM cache */ #define NB_INVAL 0x00002000 /* Does not contain valid info. */ +#define NB_NCRDAHEAD 0x00004000 /* "nocache readahead" data */ #define NB_NOCACHE 0x00008000 /* Do not cache block after use. */ +#define NB_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define NB_READ 0x00100000 /* Read buffer. */ +#define NB_MULTASYNCRPC 0x00200000 /* multiple async RPCs issued for buffer */ #define NB_PAGELIST 0x00400000 /* Buffer describes pagelist I/O. */ -#define NB_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define NB_WRITEINPROG 0x01000000 /* Write in progress. */ #define NB_META 0x40000000 /* buffer contains meta-data. */ -#define NB_IOD 0x80000000 /* buffer being handled by nfsiod. 
*/ /* Flags for operation type in nfs_buf_get() */ #define NBLK_READ 0x00000001 /* buffer for read */ @@ -196,18 +195,15 @@ struct nfsbuf { LIST_HEAD(nfsbuflists, nfsbuf); TAILQ_HEAD(nfsbuffreehead, nfsbuf); -#define NFSNOLIST ((struct nfsbuf *)0xdeadbeef) - extern lck_mtx_t *nfs_buf_mutex; extern int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax; extern int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer; extern int nfs_nbdwrite; extern struct nfsbuffreehead nfsbuffree, nfsbufdelwri; -extern time_t nfsbuffreeuptimestamp; -#define NFSBUFCNTCHK(locked) \ +#ifdef NFSBUFDEBUG +#define NFSBUFCNTCHK() \ do { \ - if (!locked) lck_mtx_lock(nfs_buf_mutex); \ if ( (nfsbufcnt < 0) || \ (nfsbufcnt > nfsbufmax) || \ (nfsbufmetacnt < 0) || \ @@ -230,78 +226,415 @@ extern time_t nfsbuffreeuptimestamp; panic("nfsbuf count error: max %d meta %d cnt %d meta %d free %d meta %d delwr %d bdw %d\n", \ nfsbufmax, nfsbufmetamax, nfsbufcnt, nfsbufmetacnt, nfsbuffreecnt, nfsbuffreemetacnt, \ nfsbufdelwricnt, nfs_nbdwrite); \ - if (!locked) lck_mtx_unlock(nfs_buf_mutex); \ + } while (0) +#else +#define NFSBUFCNTCHK() +#endif + +/* + * NFS directory buffer + * + * Each buffer for a directory consists of: + * + * - a small header + * - a packed list of direntry structures + * (if RDIRPLUS is enabled, a file handle and attrstamp are + * packed after the direntry name.) + * - free/unused space + * - if RDIRPLUS is enabled, an array of attributes + * that is indexed backwards from the end of the buffer. 
+ */ +struct nfs_dir_buf_header { + uint16_t ndbh_flags; /* flags (see below) */ + uint16_t ndbh_count; /* # of entries */ + uint32_t ndbh_entry_end; /* end offset of direntry data */ + uint32_t ndbh_ncgen; /* name cache generation# */ + uint32_t ndbh_pad; /* reserved */ +}; +/* ndbh_flags */ +#define NDB_FULL 0x0001 /* buffer has been filled */ +#define NDB_EOF 0x0002 /* buffer contains EOF */ +#define NDB_PLUS 0x0004 /* buffer contains RDIRPLUS data */ + +#define NFS_DIR_BUF_FIRST_DIRENTRY(BP) \ + ((struct direntry*)((char*)((BP)->nb_data) + sizeof(struct nfs_dir_buf_header))) /* entries start right after the header; sized by type so no caller-local "ndbhp" is required */ +#define NFS_DIR_BUF_NVATTR(BP, IDX) \ + (&((struct nfs_vattr*)((char*)((BP)->nb_data) + (BP)->nb_bufsize))[-((IDX)+1)]) +#define NFS_DIRENTRY_LEN(namlen) \ + ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7) +#define NFS_DIRENT_LEN(namlen) \ + ((sizeof(struct dirent) - (NAME_MAX+1)) + (((namlen) + 1 + 3) &~ 3)) +#define NFS_DIRENTRY_NEXT(DP) \ + ((struct direntry*)((char*)(DP) + (DP)->d_reclen)) +#define NFS_DIR_COOKIE_POTENTIALLY_TRUNCATED(C) \ + ((C) && ((((C) >> 32) == 0) || (((C) & 0x80000000ULL) && (((C) >> 32) == 0xffffffff)))) +#define NFS_DIR_COOKIE_SAME32(C1, C2) \ + (((C1) & 0xffffffffULL) == ((C2) & 0xffffffffULL)) + +/* + * NFS directory cookie cache + * + * This structure is used to cache cookie-to-buffer mappings for + * cookies recently returned from READDIR. The entries are kept in an + * array. The most-recently-used (MRU) list is headed by the entry at + * index "mru". The index of the next entry in the list is kept in the + * "next" array. (An index value of -1 marks an invalid entry.) 
+ */ +#define NFSNUMCOOKIES 14 +struct nfsdmap { + int8_t free; /* next unused slot */ + int8_t mru; /* head of MRU list */ + int8_t next[NFSNUMCOOKIES]; /* MRU list links */ + struct { + uint64_t key; /* cookie */ + uint64_t lbn; /* lbn of buffer */ + } cookies[NFSNUMCOOKIES]; /* MRU list entries */ +}; + +/* + * NFS vnode attribute structure + */ +#define NFSTIME_ACCESS 0 /* time of last access */ +#define NFSTIME_MODIFY 1 /* time of last modification */ +#define NFSTIME_CHANGE 2 /* time file changed */ +#define NFSTIME_CREATE 3 /* time file created */ +#define NFSTIME_BACKUP 4 /* time of last backup */ +#define NFSTIME_COUNT 5 + +#define NFS_COMPARE_MTIME(TVP, NVAP, CMP) \ + (((TVP)->tv_sec == (NVAP)->nva_timesec[NFSTIME_MODIFY]) ? \ + ((TVP)->tv_nsec CMP (NVAP)->nva_timensec[NFSTIME_MODIFY]) : \ + ((TVP)->tv_sec CMP (NVAP)->nva_timesec[NFSTIME_MODIFY])) +#define NFS_COPY_TIME(TVP, NVAP, WHICH) \ + do { \ + (TVP)->tv_sec = (NVAP)->nva_timesec[NFSTIME_##WHICH]; \ + (TVP)->tv_nsec = (NVAP)->nva_timensec[NFSTIME_##WHICH]; \ } while (0) struct nfs_vattr { enum vtype nva_type; /* vnode type (for create) */ - u_short nva_mode; /* files access mode and type */ - dev_t nva_rdev; /* device the special file represents */ + uint32_t nva_mode; /* file's access mode (and type) */ uid_t nva_uid; /* owner user id */ gid_t nva_gid; /* owner group id */ - uint32_t nva_fsid; /* file system id (dev for now) */ - uint64_t nva_nlink; /* number of references to file */ + guid_t nva_uuuid; /* owner user UUID */ + guid_t nva_guuid; /* owner group UUID */ + kauth_acl_t nva_acl; /* access control list */ + nfs_specdata nva_rawdev; /* device the special file represents */ + uint32_t nva_flags; /* file flags (see below) */ + uint32_t nva_maxlink; /* maximum # of links (v4) */ + uint64_t nva_nlink; /* number of references to file */ uint64_t nva_fileid; /* file id */ + nfs_fsid nva_fsid; /* file system id */ uint64_t nva_size; /* file size in bytes */ uint64_t nva_bytes; /* bytes of disk 
space held by file */ - uint32_t nva_blocksize; /* blocksize preferred for i/o */ - struct timespec nva_atime; /* time of last access */ - struct timespec nva_mtime; /* time of last modification */ - struct timespec nva_ctime; /* time file changed */ + uint64_t nva_change; /* change attribute */ + int64_t nva_timesec[NFSTIME_COUNT]; + int32_t nva_timensec[NFSTIME_COUNT]; + uint32_t nva_bitmap[NFS_ATTR_BITMAP_LEN]; /* attributes that are valid */ +}; + +/* nva_flags */ +#define NFS_FFLAG_ARCHIVED 0x0001 +#define NFS_FFLAG_HIDDEN 0x0002 +#define NFS_FFLAG_HAS_NAMED_ATTRS 0x0004 /* file has named attributes */ +#define NFS_FFLAG_TRIGGER 0x0008 /* node is a trigger/mirror mount point */ +#define NFS_FFLAG_TRIGGER_REFERRAL 0x0010 /* trigger is a referral */ +#define NFS_FFLAG_IS_ATTR 0x8000 /* file is a named attribute file/directory */ + +/* flags for nfs_getattr() */ +#define NGA_CACHED 0x0001 /* use cached attributes (if still valid) */ +#define NGA_UNCACHED 0x0002 /* fetch new attributes */ +#define NGA_ACL 0x0004 /* fetch ACL */ +#define NGA_MONITOR 0x0008 /* vnode monitor attr update poll */ +#define NGA_SOFT 0x0010 /* use cached attributes if ETIMEOUT */ + +/* macros for initting/cleaning up nfs_vattr structures */ +#define NVATTR_INIT(NVAP) \ + do { \ + NFS_CLEAR_ATTRIBUTES((NVAP)->nva_bitmap); \ + (NVAP)->nva_flags = 0; \ + (NVAP)->nva_acl = NULL; \ + } while (0) +#define NVATTR_CLEANUP(NVAP) \ + do { \ + NFS_CLEAR_ATTRIBUTES((NVAP)->nva_bitmap); \ + if ((NVAP)->nva_acl) { \ + kauth_acl_free((NVAP)->nva_acl); \ + (NVAP)->nva_acl = NULL; \ + } \ + } while (0) + +/* + * macros for detecting node changes + * + * These macros help us determine if a file has been changed on the server and + * thus whether or not we need to invalidate any cached data. + * + * For NFSv2/v3, the modification time is used. + * For NFSv4, the change attribute is used. + */ +#define NFS_CHANGED(VERS, NP, NVAP) \ + (((VERS) >= NFS_VER4) ? 
\ + ((NP)->n_change != (NVAP)->nva_change) : \ + NFS_COMPARE_MTIME(&(NP)->n_mtime, (NVAP), !=)) +#define NFS_CHANGED_NC(VERS, NP, NVAP) \ + (((VERS) >= NFS_VER4) ? \ + ((NP)->n_ncchange != (NVAP)->nva_change) : \ + NFS_COMPARE_MTIME(&(NP)->n_ncmtime, (NVAP), !=)) +#define NFS_CHANGED_UPDATE(VERS, NP, NVAP) \ + do { \ + if ((VERS) >= NFS_VER4) \ + (NP)->n_change = (NVAP)->nva_change; \ + else \ + NFS_COPY_TIME(&(NP)->n_mtime, (NVAP), MODIFY); \ + } while (0) +#define NFS_CHANGED_UPDATE_NC(VERS, NP, NVAP) \ + do { \ + if ((VERS) >= NFS_VER4) \ + (NP)->n_ncchange = (NVAP)->nva_change; \ + else \ + NFS_COPY_TIME(&(NP)->n_ncmtime, (NVAP), MODIFY); \ + } while (0) + + +extern lck_grp_t *nfs_open_grp; +extern uint32_t nfs_open_owner_seqnum, nfs_lock_owner_seqnum; + +/* + * NFSv4 open owner structure - one per cred per mount + */ +struct nfs_open_owner { + TAILQ_ENTRY(nfs_open_owner) noo_link; /* List of open owners (on mount) */ + lck_mtx_t noo_lock; /* owner mutex */ + struct nfsmount * noo_mount; /* NFS mount */ + uint32_t noo_refcnt; /* # outstanding references */ + uint32_t noo_flags; /* see below */ + kauth_cred_t noo_cred; /* credentials of open owner */ + uint32_t noo_name; /* unique name used otw */ + uint32_t noo_seqid; /* client-side sequence ID */ + TAILQ_HEAD(,nfs_open_file) noo_opens; /* list of open files */ }; +/* noo_flags */ +#define NFS_OPEN_OWNER_LINK 0x1 /* linked into mount's open owner list */ +#define NFS_OPEN_OWNER_BUSY 0x2 /* open state-modifying operation in progress */ +#define NFS_OPEN_OWNER_WANT 0x4 /* someone else wants to mark busy */ /* - * The nfsnode is the nfs equivalent to ufs's inode. Any similarity - * is purely coincidental. - * There is a unique nfsnode allocated for each active file, - * each current directory, each mounted-on file, text file, and the root. 
+ * NFS open file structure - one per open owner per nfsnode + */ +struct nfs_open_file { + lck_mtx_t nof_lock; /* open file mutex */ + TAILQ_ENTRY(nfs_open_file) nof_link; /* list of open files */ + TAILQ_ENTRY(nfs_open_file) nof_oolink; /* list of open owner's open files */ + struct nfs_open_owner * nof_owner; /* open owner */ + nfsnode_t nof_np; /* nfsnode this open is for */ + nfs_stateid nof_stateid; /* open stateid */ + thread_t nof_creator; /* thread that created file */ + uint32_t nof_opencnt; /* open file count */ + uint16_t nof_flags; /* see below */ + uint8_t nof_access:4; /* access mode for this open */ + uint8_t nof_deny:4; /* deny mode for this open */ + uint8_t nof_mmap_access:4; /* mmap open access mode */ + uint8_t nof_mmap_deny:4; /* mmap open deny mode */ + /* counts of access/deny mode open combinations */ + uint32_t nof_r; /* read opens (deny none) */ + uint32_t nof_w; /* write opens (deny none) */ + uint32_t nof_rw; /* read/write opens (deny none) */ + uint32_t nof_r_dw; /* read deny-write opens */ + /* the rest of the counts have a max of 2 (1 for open + 1 for mmap) */ + uint32_t nof_w_dw:2; /* write deny-write opens (max 2) */ + uint32_t nof_rw_dw:2; /* read/write deny-write opens (max 2) */ + uint32_t nof_r_drw:2; /* read deny-read/write opens (max 2) */ + uint32_t nof_w_drw:2; /* write deny-read/write opens (max 2) */ + uint32_t nof_rw_drw:2; /* read/write deny-read/write opens (max 2) */ + /* counts of DELEGATED access/deny mode open combinations */ + uint32_t nof_d_w_dw:2; /* write deny-write opens (max 2) */ + uint32_t nof_d_rw_dw:2; /* read/write deny-write opens (max 2) */ + uint32_t nof_d_r_drw:2; /* read deny-read/write opens (max 2) */ + uint32_t nof_d_w_drw:2; /* write deny-read/write opens (max 2) */ + uint32_t nof_d_rw_drw:2; /* read/write deny-read/write opens (max 2) */ + uint32_t nof_d_r; /* read opens (deny none) */ + uint32_t nof_d_w; /* write opens (deny none) */ + uint32_t nof_d_rw; /* read/write opens (deny none) */ + 
uint32_t nof_d_r_dw; /* read deny-write opens */ +}; +/* nof_flags */ +#define NFS_OPEN_FILE_BUSY 0x0001 /* open state-modifying operation in progress */ +#define NFS_OPEN_FILE_WANT 0x0002 /* someone else wants to mark busy */ +#define NFS_OPEN_FILE_CREATE 0x0004 /* has an open(RW) from a "CREATE" call */ +#define NFS_OPEN_FILE_NEEDCLOSE 0x0008 /* has an open(R) from an (unopen) VNOP_READ or VNOP_MMAP call */ +#define NFS_OPEN_FILE_SETATTR 0x0020 /* has an open(W) to perform a SETATTR(size) */ +#define NFS_OPEN_FILE_POSIXLOCK 0x0040 /* server supports POSIX locking semantics */ +#define NFS_OPEN_FILE_LOST 0x0080 /* open state has been lost */ +#define NFS_OPEN_FILE_REOPEN 0x0100 /* file needs to be reopened */ +#define NFS_OPEN_FILE_REOPENING 0x0200 /* file is being reopened */ + +struct nfs_lock_owner; +/* + * NFS file lock + * + * Each lock request (pending or granted) has an + * nfs_file_lock structure representing its state. + */ +struct nfs_file_lock { + TAILQ_ENTRY(nfs_file_lock) nfl_link; /* List of locks on nfsnode */ + TAILQ_ENTRY(nfs_file_lock) nfl_lolink; /* List of locks held by locker */ + struct nfs_lock_owner * nfl_owner; /* lock owner that holds this lock */ + uint64_t nfl_start; /* starting offset */ + uint64_t nfl_end; /* ending offset (inclusive) */ + uint32_t nfl_blockcnt; /* # locks blocked on this lock */ + uint16_t nfl_flags; /* see below */ + uint8_t nfl_type; /* lock type: read/write */ +}; +/* nfl_flags */ +#define NFS_FILE_LOCK_ALLOC 0x01 /* lock was allocated */ +#define NFS_FILE_LOCK_STYLE_POSIX 0x02 /* POSIX-style fcntl() lock */ +#define NFS_FILE_LOCK_STYLE_FLOCK 0x04 /* flock(2)-style lock */ +#define NFS_FILE_LOCK_STYLE_MASK 0x06 /* lock style mask */ +#define NFS_FILE_LOCK_WAIT 0x08 /* may block on conflicting locks */ +#define NFS_FILE_LOCK_BLOCKED 0x10 /* request is blocked */ +#define NFS_FILE_LOCK_DEAD 0x20 /* lock (request) no longer exists */ +#define NFS_FILE_LOCK_DELEGATED 0x40 /* lock acquired via delegation */ + 
+TAILQ_HEAD(nfs_file_lock_queue, nfs_file_lock); + +/* + * Calculate length of lock range given the endpoints. + * Note that struct flock has "to EOF" reported as 0 but + * the NFSv4 protocol has "to EOF" reported as UINT64_MAX. + */ +#define NFS_FLOCK_LENGTH(S, E) (((E) == UINT64_MAX) ? 0 : ((E) - (S) + 1)) +#define NFS_LOCK_LENGTH(S, E) (((E) == UINT64_MAX) ? UINT64_MAX : ((E) - (S) + 1)) + +/* + * NFSv4 lock owner structure - per open owner per process per nfsnode + * + * A lock owner is a process + an nfsnode. + * + * Note that flock(2) locks technically should have the lock owner be + * an fglob pointer instead of a process. However, implementing that + * correctly would not be trivial. So, for now, flock(2) locks are + * essentially treated like whole-file POSIX locks. + */ +struct nfs_lock_owner { + lck_mtx_t nlo_lock; /* owner mutex */ + TAILQ_ENTRY(nfs_lock_owner) nlo_link; /* List of lock owners (on nfsnode) */ + struct nfs_open_owner * nlo_open_owner; /* corresponding open owner */ + struct nfs_file_lock_queue nlo_locks; /* list of locks held */ + struct nfs_file_lock nlo_alock; /* most lockers will only ever have one */ + struct timeval nlo_pid_start; /* Start time of process id */ + pid_t nlo_pid; /* lock-owning process ID */ + uint32_t nlo_refcnt; /* # outstanding references */ + uint32_t nlo_flags; /* see below */ + uint32_t nlo_name; /* unique name used otw */ + uint32_t nlo_seqid; /* client-side sequence ID */ + uint32_t nlo_stategenid; /* mount state generation ID */ + nfs_stateid nlo_stateid; /* lock stateid */ +}; +/* nlo_flags */ +#define NFS_LOCK_OWNER_LINK 0x1 /* linked into a lock owner list (presumably via nlo_link, i.e. the nfsnode's list - TODO confirm) */ +#define NFS_LOCK_OWNER_BUSY 0x2 /* lock state-modifying operation in progress */ +#define NFS_LOCK_OWNER_WANT 0x4 /* someone else wants to mark busy */ + +/* + * The nfsnode is the NFS equivalent of an inode. + * There is a unique nfsnode for each NFS vnode. * An nfsnode is 'named' by its file handle. 
(nget/nfs_node.c) - * If this structure exceeds 256 bytes (it is currently 256 using 4.4BSD-Lite - * type definitions), file handles of > 32 bytes should probably be split out - * into a separate MALLOC()'d data structure. (Reduce the size of nfsnode.n_fh - * by changing the definition in nfsproto.h of NFS_SMALLFH.) * NB: Hopefully the current order of the fields is such that everything will * be well aligned and, therefore, tightly packed. */ + +#define NFS_ACCESS_CACHE_SIZE 3 + struct nfsnode { + lck_mtx_t n_lock; /* nfs node lock */ + lck_rw_t n_datalock; /* nfs node data lock */ + void *n_datalockowner;/* nfs node data lock owner (exclusive) */ LIST_ENTRY(nfsnode) n_hash; /* Hash chain */ + LIST_ENTRY(nfsnode) n_monlink; /* list of monitored nodes */ u_quad_t n_size; /* Current size of file */ + u_quad_t n_newsize; /* new size of file (pending update) */ + u_int64_t n_xid; /* last xid to loadattr */ struct nfs_vattr n_vattr; /* Vnode attribute cache */ time_t n_attrstamp; /* Attr. cache timestamp */ - u_int32_t n_mode; /* ACCESS mode cache */ - uid_t n_modeuid; /* credentials having mode */ - time_t n_modestamp; /* mode cache timestamp */ - struct timespec n_mtime; /* Prev modify time. */ - struct timespec n_ncmtime; /* namecache modify time. */ - u_char *n_fhp; /* NFS File Handle */ + time_t n_aclstamp; /* ACL cache timestamp */ + time_t n_evtstamp; /* last vnode event timestamp */ + uint32_t n_events; /* pending vnode events */ + u_int8_t n_access[NFS_ACCESS_CACHE_SIZE+1]; /* ACCESS cache */ + uid_t n_accessuid[NFS_ACCESS_CACHE_SIZE]; /* credentials having access */ + time_t n_accessstamp[NFS_ACCESS_CACHE_SIZE]; /* access cache timestamp */ union { - vnode_t n_vp; /* associated vnode */ - mount_t n_mp; /* associated mount (NINIT) */ - } n_un0; - struct lockf *n_lockf; /* Locking record of file */ + struct { + struct timespec n3_mtime; /* Prev modify time. */ + struct timespec n3_ncmtime; /* namecache modify time. 
*/ + } v3; + struct { + uint64_t n4_change; /* prev change attribute */ + uint64_t n4_ncchange; /* namecache change attribute */ + u_char *n4_attrdirfh; /* associated attr directory fh */ + struct timeval n4_lastio; /* time of most recent I/O on attr */ + } v4; + } n_un4; + vnode_t n_parent; /* this node's parent */ + u_char *n_fhp; /* NFS File Handle */ + vnode_t n_vnode; /* associated vnode */ + mount_t n_mount; /* associated mount (NHINIT) */ int n_error; /* Save write error value */ union { - struct timespec nf_atim; /* Special file times */ - nfsuint64 nd_cookieverf; /* Cookie verifier (dir only) */ + struct timespec ns_atim; /* Special file times */ + struct timespec nl_rltim; /* Time of last readlink */ + daddr64_t nf_lastread; /* last block# read from (for readahead) */ + uint64_t nd_cookieverf; /* Cookie verifier (dir only) */ } n_un1; union { - struct timespec nf_mtim; - off_t nd_direof; /* Dir. EOF offset cache */ + struct timespec ns_mtim; /* Special file times */ + daddr64_t nf_lastrahead; /* last block# read ahead */ + uint64_t nd_eofcookie; /* Dir. EOF cookie cache */ } n_un2; union { - struct sillyrename *nf_silly; /* Ptr to silly rename struct */ - LIST_HEAD(, nfsdmap) nd_cook; /* cookies */ + struct nfs_sillyrename *nf_silly;/* Ptr to silly rename struct */ + struct nfsdmap *nd_cookiecache; /* dir cookie cache */ } n_un3; - short n_fhsize; /* size in bytes, of fh */ - short n_flag; /* Flag for locking.. 
*/ + uint32_t n_flag; /* node flags */ + u_short n_fhsize; /* size in bytes, of fh */ + u_short n_hflag; /* node hash flags */ + u_short n_bflag; /* node buffer flags */ + u_short n_mflag; /* node mount flags */ u_char n_fh[NFS_SMALLFH];/* Small File Handle */ - u_int64_t n_xid; /* last xid to loadattr */ + uint32_t n_auth; /* security flavor used for this node */ struct nfsbuflists n_cleanblkhd; /* clean blocklist head */ struct nfsbuflists n_dirtyblkhd; /* dirty blocklist head */ - int n_needcommitcnt;/* # bufs that need committing */ + union { + int nf_wrbusy; /* # threads in write/fsync */ + uint32_t nd_ncgen; /* dir name cache generation# */ + } n_un5; + union { + int nf_needcommitcnt;/* # bufs that need committing */ + daddr64_t nd_lastdbl; /* last dir buf lookup block# */ + } n_un6; int n_bufiterflags; /* buf iterator flags */ + union { + int nf_numoutput; /* write I/Os in progress */ + int nd_trigseq; /* vnode trigger seq# */ + } n_un7; + /* open state */ + lck_mtx_t n_openlock; /* nfs node open lock */ + uint32_t n_openflags; /* open state flags */ + uint32_t n_openrefcnt; /* # non-file opens */ + TAILQ_HEAD(,nfs_open_file) n_opens; /* list of open files */ + /* lock state */ + TAILQ_HEAD(, nfs_lock_owner) n_lock_owners; /* list of lock owners */ + struct nfs_file_lock_queue n_locks; /* list of locks */ + /* delegation state */ + nfs_stateid n_dstateid; /* delegation stateid */ + TAILQ_ENTRY(nfsnode) n_dlink; /* delegation list link */ + TAILQ_ENTRY(nfsnode) n_dreturn; /* delegation return list link */ + struct kauth_ace n_dace; /* delegation ACE */ }; +#define NFS_DATA_LOCK_SHARED 1 +#define NFS_DATA_LOCK_EXCLUSIVE 2 + #define nfstimespeccmp(tvp, uvp, cmp) \ (((tvp)->tv_sec == (uvp)->tv_sec) ? 
\ ((tvp)->tv_nsec cmp (uvp)->tv_nsec) : \ @@ -315,39 +648,100 @@ struct nfsnode { } \ } while (0) -#define n_vnode n_un0.n_vp -#define n_mount n_un0.n_mp -#define n_atim n_un1.nf_atim -#define n_mtim n_un2.nf_mtim -#define n_sillyrename n_un3.nf_silly -#define n_cookieverf n_un1.nd_cookieverf -#define n_direofoffset n_un2.nd_direof -#define n_cookies n_un3.nd_cook +#define n_atim n_un1.ns_atim +#define n_mtim n_un2.ns_mtim +#define n_rltim n_un1.nl_rltim +#define n_lastread n_un1.nf_lastread +#define n_lastrahead n_un2.nf_lastrahead +#define n_sillyrename n_un3.nf_silly +#define n_wrbusy n_un5.nf_wrbusy +#define n_needcommitcnt n_un6.nf_needcommitcnt +#define n_numoutput n_un7.nf_numoutput +#define n_cookieverf n_un1.nd_cookieverf +#define n_eofcookie n_un2.nd_eofcookie +#define n_cookiecache n_un3.nd_cookiecache +#define n_ncgen n_un5.nd_ncgen +#define n_lastdbl n_un6.nd_lastdbl +#define n_trigseq n_un7.nd_trigseq +#define n_mtime n_un4.v3.n3_mtime +#define n_ncmtime n_un4.v3.n3_ncmtime +#define n_change n_un4.v4.n4_change +#define n_ncchange n_un4.v4.n4_ncchange +#define n_attrdirfh n_un4.v4.n4_attrdirfh +#define n_lastio n_un4.v4.n4_lastio /* * Flags for n_flag */ -#define NFLUSHWANT 0x0001 /* Want wakeup from a flush in prog. 
*/ -#define NFLUSHINPROG 0x0002 /* Avoid multiple calls to vinvalbuf() */ -#define NMODIFIED 0x0004 /* Might have a modified buffer in bio */ -#define NWRITEERR 0x0008 /* Flag write errors so close will know */ -#define NNEEDINVALIDATE 0x0010 /* need to call vinvalbuf() */ -#define NNOCACHE 0x0020 /* all bufs are uncached */ -#define NWRBUSY 0x0040 /* node in write/fsync */ -#define NACC 0x0100 /* Special file accessed */ -#define NUPD 0x0200 /* Special file updated */ -#define NCHG 0x0400 /* Special file times changed */ -#define NHASHED 0x1000 /* someone wants to lock */ -#define NINIT 0x2000 /* node is being initialized */ -#define NWINIT 0x4000 /* someone waiting for init to complete */ +#define NUPDATESIZE 0x00001 /* size of file needs updating */ +#define NREVOKE 0x00002 /* node revoked */ +#define NMODIFIED 0x00004 /* Might have a modified buffer in bio */ +#define NWRITEERR 0x00008 /* Flag write errors so close will know */ +#define NNEEDINVALIDATE 0x00010 /* need to call vinvalbuf() */ +#define NGETATTRINPROG 0x00020 /* GETATTR RPC in progress */ +#define NGETATTRWANT 0x00040 /* waiting for GETATTR RPC */ +#define NACC 0x00100 /* Special file accessed */ +#define NUPD 0x00200 /* Special file updated */ +#define NCHG 0x00400 /* Special file times changed */ +#define NNEGNCENTRIES 0x00800 /* directory has negative name cache entries */ +#define NBUSY 0x01000 /* node is busy */ +#define NBUSYWANT 0x02000 /* waiting on busy node */ +#define NISDOTZFS 0x04000 /* a ".zfs" directory */ +#define NISDOTZFSCHILD 0x08000 /* a child of a ".zfs" directory */ +#define NISMAPPED 0x10000 /* node is mmapped */ +#define NREFRESH 0x20000 /* node's fh needs to be refreshed */ +#define NREFRESHWANT 0x40000 /* Waiting for fh to be refreshed */ + +/* + * Flags for n_hflag + * Note: protected by nfs_node_hash_mutex + */ +#define NHHASHED 0x0001 /* node is in hash table */ +#define NHINIT 0x0002 /* node is being initialized */ +#define NHLOCKED 0x0004 /* node is locked (initting 
or deleting) */ +#define NHLOCKWANT 0x0008 /* someone wants to lock */ + +/* + * Flags for n_bflag + * Note: protected by nfs_buf_mutex + */ +#define NBFLUSHINPROG 0x0001 /* Avoid multiple calls to nfs_flush() */ +#define NBFLUSHWANT 0x0002 /* waiting for nfs_flush() to complete */ +#define NBINVALINPROG 0x0004 /* Avoid multiple calls to nfs_vinvalbuf() */ +#define NBINVALWANT 0x0008 /* waiting for nfs_vinvalbuf() to complete */ + +/* + * Flags for n_mflag + * Note: protected by nfsmount's nm_lock + */ +#define NMMONSCANINPROG 0x0001 /* monitored node is currently updating attributes */ +#define NMMONSCANWANT 0x0002 /* waiting for attribute update to complete */ +/* + * n_openflags + * Note: protected by n_openlock + */ +#define N_OPENBUSY 0x0001 /* open state is busy - being updated */ +#define N_OPENWANT 0x0002 /* someone wants to mark busy */ +#define N_DELEG_READ 0x0004 /* we have a read delegation */ +#define N_DELEG_WRITE 0x0008 /* we have a write delegation */ +#define N_DELEG_MASK 0x000c /* delegation mask */ +#define N_DELEG_RETURN 0x0010 /* delegation queued for return */ +#define N_DELEG_RETURNING 0x0020 /* delegation being returned */ + +/* attr/access/ACL cache timestamp macros */ #define NATTRVALID(np) ((np)->n_attrstamp != ~0) #define NATTRINVALIDATE(np) ((np)->n_attrstamp = ~0) -#define NMODEVALID(np) ((np)->n_modestamp != ~0) -#define NMODEINVALIDATE(np) ((np)->n_modestamp = ~0) - -#define NVALIDBUFS(np) (!LIST_EMPTY(&(np)->n_dirtyblkhd) || \ - !LIST_EMPTY(&(np)->n_cleanblkhd)) +#define NACCESSVALID(np, slot) (((slot) >= 0) && ((slot) < NFS_ACCESS_CACHE_SIZE) && ((np)->n_accessstamp[(slot)] != ~0)) +#define NACCESSINVALIDATE(np) \ + do { \ + int __i; \ + for (__i=0; __i < NFS_ACCESS_CACHE_SIZE; __i++) \ + (np)->n_accessstamp[__i] = ~0; \ + (np)->n_access[NFS_ACCESS_CACHE_SIZE] = 0; \ + } while (0) +#define NACLVALID(np) ((np)->n_aclstamp != ~0) +#define NACLINVALIDATE(np) ((np)->n_aclstamp = ~0) /* * NFS-specific flags for nfs_vinvalbuf/nfs_flush 
@@ -359,25 +753,39 @@ struct nfsnode { */ #define NG_MARKROOT 0x0001 /* mark vnode as root of FS */ #define NG_MAKEENTRY 0x0002 /* add name cache entry for vnode */ +#define NG_NOCREATE 0x0004 /* don't create a new node, return existing one */ /* * Convert between nfsnode pointers and vnode pointers */ -#define VTONFS(vp) ((struct nfsnode *)vnode_fsnode(vp)) +#define VTONFS(vp) ((nfsnode_t)vnode_fsnode(vp)) #define NFSTOV(np) ((np)->n_vnode) /* nfsnode hash table mutex */ extern lck_mtx_t *nfs_node_hash_mutex; +/* + * printf-like helper macro that also outputs node name. + */ +#define NP(NP, FMT, ...) \ + do { \ + const char *__vname = (NP) ? vnode_getname(NFSTOV(NP)) : NULL; \ + printf(FMT " %s\n", ##__VA_ARGS__, __vname ? __vname : "???"); \ + if (__vname) vnode_putname(__vname); \ + } while (0) + /* * nfsiod structures */ -extern proc_t nfs_iodwant[NFS_MAXASYNCDAEMON]; -extern struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; -extern lck_grp_t *nfs_iod_lck_grp; -extern lck_grp_attr_t *nfs_iod_lck_grp_attr; -extern lck_attr_t *nfs_iod_lck_attr; -extern lck_mtx_t *nfs_iod_mutex; +struct nfsiod { + TAILQ_ENTRY(nfsiod) niod_link; /* List of nfsiods */ + struct nfsmount * niod_nmp; /* mount point for this nfsiod */ +}; +TAILQ_HEAD(nfsiodlist, nfsiod); +TAILQ_HEAD(nfsiodmountlist, nfsmount); +extern struct nfsiodlist nfsiodfree, nfsiodwork; +extern struct nfsiodmountlist nfsiodmounts; +extern lck_mtx_t *nfsiod_mutex; #if defined(KERNEL) @@ -385,42 +793,97 @@ typedef int vnop_t(void *); extern vnop_t **fifo_nfsv2nodeop_p; extern vnop_t **nfsv2_vnodeop_p; extern vnop_t **spec_nfsv2nodeop_p; +extern vnop_t **fifo_nfsv4nodeop_p; +extern vnop_t **nfsv4_vnodeop_p; +extern vnop_t **spec_nfsv4nodeop_p; /* * Prototypes for NFS vnode operations */ -int nfs_write(struct vnop_write_args *); -#define nfs_revoke nop_revoke -#define nfs_seek ((int (*)(struct vnop_seek_args *))nullop) //XXXdead? 
-int nfs_inactive(struct vnop_inactive_args *); -int nfs_reclaim(struct vnop_reclaim_args *); - +#define nfs_vnop_revoke nop_revoke +int nfs_vnop_inactive(struct vnop_inactive_args *); +int nfs_vnop_reclaim(struct vnop_reclaim_args *); + +int nfs_node_lock(nfsnode_t); +int nfs_node_lock_internal(nfsnode_t, int); +void nfs_node_lock_force(nfsnode_t); +void nfs_node_unlock(nfsnode_t); +int nfs_node_lock2(nfsnode_t, nfsnode_t); +void nfs_node_unlock2(nfsnode_t, nfsnode_t); +int nfs_node_set_busy(nfsnode_t, thread_t); +int nfs_node_set_busy2(nfsnode_t, nfsnode_t, thread_t); +int nfs_node_set_busy4(nfsnode_t, nfsnode_t, nfsnode_t, nfsnode_t, thread_t); +void nfs_node_clear_busy(nfsnode_t); +void nfs_node_clear_busy2(nfsnode_t, nfsnode_t); +void nfs_node_clear_busy4(nfsnode_t, nfsnode_t, nfsnode_t, nfsnode_t); +void nfs_data_lock(nfsnode_t, int); +void nfs_data_lock_noupdate(nfsnode_t, int); +void nfs_data_lock_internal(nfsnode_t, int, int); +void nfs_data_unlock(nfsnode_t); +void nfs_data_unlock_noupdate(nfsnode_t); +void nfs_data_unlock_internal(nfsnode_t, int); +void nfs_data_update_size(nfsnode_t, int); /* other stuff */ -int nfs_removeit(struct sillyrename *); -int nfs_nget(mount_t,vnode_t,struct componentname *,u_char *,int,struct nfs_vattr *,u_int64_t *,int,struct nfsnode **); -nfsuint64 *nfs_getcookie(struct nfsnode *, off_t, int); -void nfs_invaldir(vnode_t); +int nfs_removeit(struct nfs_sillyrename *); +int nfs_nget(mount_t,nfsnode_t,struct componentname *,u_char *,int,struct nfs_vattr *,u_int64_t *,uint32_t,int,nfsnode_t*); +int nfs_mount_is_dirty(mount_t); +void nfs_dir_cookie_cache(nfsnode_t, uint64_t, uint64_t); +int nfs_dir_cookie_to_lbn(nfsnode_t, uint64_t, int *, uint64_t *); +void nfs_invaldir(nfsnode_t); +uint32_t nfs_dir_buf_freespace(struct nfsbuf *, int); /* nfsbuf functions */ void nfs_nbinit(void); +void nfs_buf_timer(void *, void *); void nfs_buf_remfree(struct nfsbuf *); -boolean_t nfs_buf_is_incore(vnode_t, daddr64_t); -struct nfsbuf * 
nfs_buf_incore(vnode_t, daddr64_t); -int nfs_buf_get(vnode_t, daddr64_t, int, proc_t, int, struct nfsbuf **); +boolean_t nfs_buf_is_incore(nfsnode_t, daddr64_t); +struct nfsbuf * nfs_buf_incore(nfsnode_t, daddr64_t); +int nfs_buf_get(nfsnode_t, daddr64_t, uint32_t, thread_t, int, struct nfsbuf **); int nfs_buf_upl_setup(struct nfsbuf *bp); void nfs_buf_upl_check(struct nfsbuf *bp); +void nfs_buf_normalize_valid_range(nfsnode_t, struct nfsbuf *); +int nfs_buf_map(struct nfsbuf *); void nfs_buf_release(struct nfsbuf *, int); int nfs_buf_iowait(struct nfsbuf *); void nfs_buf_iodone(struct nfsbuf *); -void nfs_buf_write_delayed(struct nfsbuf *, proc_t); +void nfs_buf_write_delayed(struct nfsbuf *); +void nfs_buf_check_write_verifier(nfsnode_t, struct nfsbuf *); void nfs_buf_freeup(int); void nfs_buf_refget(struct nfsbuf *bp); void nfs_buf_refrele(struct nfsbuf *bp); void nfs_buf_drop(struct nfsbuf *); errno_t nfs_buf_acquire(struct nfsbuf *, int, int, int); -int nfs_buf_iterprepare(struct nfsnode *, struct nfsbuflists *, int); -void nfs_buf_itercomplete(struct nfsnode *, struct nfsbuflists *, int); +int nfs_buf_iterprepare(nfsnode_t, struct nfsbuflists *, int); +void nfs_buf_itercomplete(nfsnode_t, struct nfsbuflists *, int); + +int nfs_bioread(nfsnode_t, uio_t, int, vfs_context_t); +int nfs_buf_readahead(nfsnode_t, int, daddr64_t *, daddr64_t, thread_t, kauth_cred_t); +int nfs_buf_readdir(struct nfsbuf *, vfs_context_t); +int nfs_buf_read(struct nfsbuf *); +void nfs_buf_read_finish(struct nfsbuf *); +int nfs_buf_read_rpc(struct nfsbuf *, thread_t, kauth_cred_t); +void nfs_buf_read_rpc_finish(struct nfsreq *); +int nfs_buf_write(struct nfsbuf *); +void nfs_buf_write_finish(struct nfsbuf *, thread_t, kauth_cred_t); +int nfs_buf_write_rpc(struct nfsbuf *, int, thread_t, kauth_cred_t); +void nfs_buf_write_rpc_finish(struct nfsreq *); +int nfs_buf_write_dirty_pages(struct nfsbuf *, thread_t, kauth_cred_t); + +int nfs_flushcommits(nfsnode_t, int); +int nfs_flush(nfsnode_t, 
int, thread_t, int); +void nfs_buf_delwri_push(int); +void nfs_buf_delwri_service(void); +void nfs_buf_delwri_thread(void *, wait_result_t);; + +int nfsiod_start(void); +void nfsiod_terminate(struct nfsiod *); +void nfsiod_thread(void); +int nfsiod_continue(int); +void nfs_asyncio_finish(struct nfsreq *); +void nfs_asyncio_resend(struct nfsreq *); +int nfs_async_write_start(struct nfsmount *); +void nfs_async_write_done(struct nfsmount *); #endif /* KERNEL */