[apple/xnu.git] / bsd / kern / kern_lockf.c
1 /*
2 * Copyright (c) 2019 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * This code is derived from software contributed to Berkeley by
33 * Scooter Morris at Genentech Inc.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 4. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
60 */
61
62 #include <sys/cdefs.h>
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/lock.h>
67 #include <sys/mount.h>
68 #include <sys/proc.h>
69 #include <sys/signalvar.h>
70 #include <sys/unistd.h>
71 #include <sys/user.h>
72 #include <sys/vnode.h>
73 #include <sys/vnode_internal.h>
74 #include <sys/vnode_if.h>
75 #include <sys/malloc.h>
76 #include <sys/fcntl.h>
77 #include <sys/lockf.h>
78 #include <sys/sdt.h>
79 #include <kern/policy_internal.h>
80
81 #include <sys/file_internal.h>
82
83 #if (DEVELOPMENT || DEBUG)
84 #define LOCKF_DEBUGGING 1
85 #endif
86
87 #ifdef LOCKF_DEBUGGING
88 #include <sys/sysctl.h>
89 void lf_print(const char *tag, struct lockf *lock);
90 void lf_printlist(const char *tag, struct lockf *lock);
91
92 #define LF_DBG_LOCKOP (1 << 0) /* setlk, getlk, clearlk */
93 #define LF_DBG_LIST (1 << 1) /* split, coalesce */
94 #define LF_DBG_IMPINH (1 << 2) /* importance inheritance */
95 #define LF_DBG_TRACE (1 << 3) /* errors, exit */
96 #define LF_DBG_DEADLOCK (1 << 4) /* deadlock detection */
97
98 static int lockf_debug = 0; /* was 2, could be 3 ;-) */
99 SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, "");
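/*
 * The mask can be changed at run time through the sysctl published above;
 * for example (illustrative), `sysctl debug.lockf_debug=0x3` enables the
 * LF_DBG_LOCKOP and LF_DBG_LIST diagnostics.
 */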
100
101 /*
102 * If there is no mask bit selector, or there is one, and the selector is
103 * set, then output the debugging diagnostic.
104 */
105 #define LOCKF_DEBUG(mask, ...) \
106 do { \
107 if (!(mask) || ((mask) & lockf_debug)) { \
108 printf("%s>", __FUNCTION__); \
109 printf(__VA_ARGS__); \
110 } \
111 } while(0)
112
113 #define LOCKF_DEBUGP(mask) \
114 ({ \
115 ((mask) & lockf_debug); \
116 })
117 #else /* !LOCKF_DEBUGGING */
118 #define LOCKF_DEBUG(mask, ...) /* mask */
119 #endif /* !LOCKF_DEBUGGING */
120
121 MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
122
123 #define NOLOCKF (struct lockf *)0
124 #define SELF 0x1
125 #define OTHERS 0x2
126 #define OFF_MAX 0x7fffffffffffffffULL /* max off_t */
127
128 /*
129 * Overlapping lock states
130 */
131 typedef enum {
132 OVERLAP_NONE = 0,
133 OVERLAP_EQUALS_LOCK,
134 OVERLAP_CONTAINS_LOCK,
135 OVERLAP_CONTAINED_BY_LOCK,
136 OVERLAP_STARTS_BEFORE_LOCK,
137 OVERLAP_ENDS_AFTER_LOCK
138 } overlap_t;
139
140 static int lf_clearlock(struct lockf *);
141 static overlap_t lf_findoverlap(struct lockf *,
142 struct lockf *, int, struct lockf ***, struct lockf **);
143 static struct lockf *lf_getblock(struct lockf *, pid_t);
144 static int lf_getlock(struct lockf *, struct flock *, pid_t);
145 static int lf_setlock(struct lockf *, struct timespec *);
146 static int lf_split(struct lockf *, struct lockf *);
147 static void lf_wakelock(struct lockf *, boolean_t);
148 #if IMPORTANCE_INHERITANCE
149 static void lf_hold_assertion(task_t, struct lockf *);
150 static void lf_jump_to_queue_head(struct lockf *, struct lockf *);
151 static void lf_drop_assertion(struct lockf *);
152 static void lf_boost_blocking_proc(struct lockf *, struct lockf *);
153 static void lf_adjust_assertion(struct lockf *block);
154 #endif /* IMPORTANCE_INHERITANCE */
155
156 /*
157 * lf_advlock
158 *
159 * Description: Advisory record locking support
160 *
161 * Parameters: ap Argument pointer to a vnop_advlock_args
162 * argument descriptor structure for the
163 * lock operation to be attempted.
164 *
165 * Returns: 0 Success
166 * EOVERFLOW
167 * EINVAL
168 * ENOLCK Number of locked regions exceeds limit
169 * lf_setlock:EAGAIN
170 * lf_setlock:EDEADLK
171 * lf_setlock:EINTR
172 * lf_setlock:ENOLCK
173 * lf_setlock:ETIMEDOUT
174 * lf_clearlock:ENOLCK
175 * vnode_size:???
176 *
177 * Notes: We return ENOLCK when we run out of memory to support locks; as
178 * such, there is no specific expectation limit other than the
179 * amount of available resources.
180 */
181 int
182 lf_advlock(struct vnop_advlock_args *ap)
183 {
184 struct vnode *vp = ap->a_vp;
185 struct flock *fl = ap->a_fl;
186 vfs_context_t context = ap->a_context;
187 struct lockf *lock;
188 off_t start, end, oadd;
189 u_quad_t size;
190 int error;
191 struct lockf **head = &vp->v_lockf;
192
193 /* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */
194
195 /*
196 * Avoid the common case of unlocking when inode has no locks.
197 */
198 if (*head == (struct lockf *)0) {
199 if (ap->a_op != F_SETLK) {
200 fl->l_type = F_UNLCK;
201 LOCKF_DEBUG(LF_DBG_TRACE,
202 "lf_advlock: '%s' unlock without lock\n",
203 vfs_context_proc(context)->p_comm);
204 return 0;
205 }
206 }
207
208 /*
209 * Convert the flock structure into a start and end.
210 */
211 switch (fl->l_whence) {
212 case SEEK_SET:
213 case SEEK_CUR:
214 /*
215 * Caller is responsible for adding any necessary offset
216 * when SEEK_CUR is used.
217 */
218 start = fl->l_start;
219 break;
220
221 case SEEK_END:
222
223 /*
224 * It's OK to cast the u_quad_t to an off_t here, since they
225 * are the same storage size, and the value of the returned
226 * contents will never overflow into the sign bit. We need to
227 * do this because we will use size to force range checks.
228 */
229 if ((error = vnode_size(vp, (off_t *)&size, context))) {
230 LOCKF_DEBUG(LF_DBG_TRACE,
231 "lf_advlock: vnode_getattr failed: %d\n", error);
232 return error;
233 }
234
235 if (size > OFF_MAX ||
236 (fl->l_start > 0 &&
237 size > (u_quad_t)(OFF_MAX - fl->l_start))) {
238 return EOVERFLOW;
239 }
240 start = size + fl->l_start;
241 break;
242
243 default:
244 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: unknown whence %d\n",
245 fl->l_whence);
246 return EINVAL;
247 }
248 if (start < 0) {
249 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: start < 0 (%qd)\n",
250 start);
251 return EINVAL;
252 }
253 if (fl->l_len < 0) {
254 if (start == 0) {
255 LOCKF_DEBUG(LF_DBG_TRACE,
256 "lf_advlock: len < 0 & start == 0\n");
257 return EINVAL;
258 }
259 end = start - 1;
260 start += fl->l_len;
261 if (start < 0) {
262 LOCKF_DEBUG(LF_DBG_TRACE,
263 "lf_advlock: start < 0 (%qd)\n", start);
264 return EINVAL;
265 }
266 } else if (fl->l_len == 0) {
267 end = -1;
268 } else {
269 oadd = fl->l_len - 1;
270 if (oadd > (off_t)(OFF_MAX - start)) {
271 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: overflow\n");
272 return EOVERFLOW;
273 }
274 end = start + oadd;
275 }
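	/*
	 * Worked example (illustrative, not from the original source): with
	 * l_whence = SEEK_SET, l_start = 100 and l_len = -10, the negative-
	 * length branch above yields end = 99 and start = 90, i.e. the lock
	 * covers the 10 bytes immediately preceding l_start; l_len == 0
	 * ("to end of file") is represented by end = -1.
	 */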
276 /*
277 * Create the lockf structure
278 */
279 MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
280 if (lock == NULL) {
281 return ENOLCK;
282 }
283 lock->lf_start = start;
284 lock->lf_end = end;
285 lock->lf_id = ap->a_id;
286 lock->lf_vnode = vp;
287 lock->lf_type = fl->l_type;
288 lock->lf_head = head;
289 lock->lf_next = (struct lockf *)0;
290 TAILQ_INIT(&lock->lf_blkhd);
291 lock->lf_flags = ap->a_flags;
292 #if IMPORTANCE_INHERITANCE
293 lock->lf_boosted = LF_NOT_BOOSTED;
294 #endif
295 if (ap->a_flags & F_POSIX) {
296 lock->lf_owner = (struct proc *)lock->lf_id;
297 } else {
298 lock->lf_owner = NULL;
299 }
300
301 if (ap->a_flags & F_FLOCK) {
302 lock->lf_flags |= F_WAKE1_SAFE;
303 }
304
305 lck_mtx_lock(&vp->v_lock); /* protect the lockf list */
306 /*
307 * Do the requested operation.
308 */
309 switch (ap->a_op) {
310 case F_SETLK:
311 /*
312 * For F_OFD_* locks, lf_id is the fileglob.
313 * Record an "lf_owner" iff this is a confined fd
314 * i.e. it cannot escape this process and will be
315 * F_UNLCKed before the owner exits. (This is
316 * the implicit guarantee needed to ensure lf_owner
317 * remains a valid reference here.)
318 */
319 if (ap->a_flags & F_OFD_LOCK) {
320 struct fileglob *fg = (void *)lock->lf_id;
321 if (fg->fg_lflags & FG_CONFINED) {
322 lock->lf_owner = current_proc();
323 }
324 }
325 error = lf_setlock(lock, ap->a_timeout);
326 break;
327
328 case F_UNLCK:
329 error = lf_clearlock(lock);
330 FREE(lock, M_LOCKF);
331 break;
332
333 case F_GETLK:
334 error = lf_getlock(lock, fl, -1);
335 FREE(lock, M_LOCKF);
336 break;
337
338 case F_GETLKPID:
339 error = lf_getlock(lock, fl, fl->l_pid);
340 FREE(lock, M_LOCKF);
341 break;
342
343 default:
344 FREE(lock, M_LOCKF);
345 error = EINVAL;
346 break;
347 }
348 lck_mtx_unlock(&vp->v_lock); /* done manipulating the list */
349
350 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: normal exit: %d\n", error);
351 return error;
352 }
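/*
 * Illustrative userland usage (not part of the original file): an advisory
 * record lock taken with fcntl(2) reaches lf_advlock() through the
 * filesystem's advlock VNOP.  A minimal sketch, assuming an already-open
 * file descriptor fd:
 *
 *	#include <err.h>
 *	#include <fcntl.h>
 *
 *	struct flock fl = {
 *		.l_type   = F_WRLCK,	// exclusive record lock
 *		.l_whence = SEEK_SET,
 *		.l_start  = 0,
 *		.l_len    = 4096,	// first 4096 bytes of the file
 *	};
 *	if (fcntl(fd, F_SETLKW, &fl) == -1)	// F_SETLKW blocks until granted
 *		err(1, "fcntl(F_SETLKW)");
 */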
353
354 /*
355 * Empty the queue of msleeping requests for a lock on the given vnode.
356 * Called with the vnode already locked. Used for forced unmount, where
357 * a flock(2) invoker sleeping on a blocked lock holds an iocount reference
358 * that prevents the vnode from ever being drained. Force unmounting wins.
359 */
360 void
361 lf_abort_advlocks(vnode_t vp)
362 {
363 struct lockf *lock;
364
365 if ((lock = vp->v_lockf) == NULL) {
366 return;
367 }
368
369 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
370
371 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
372 struct lockf *tlock;
373
374 TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
375 /*
376 * Setting this flag should cause all
377 * currently blocked F_SETLK requests to
378 * return to userland with an errno.
379 */
380 tlock->lf_flags |= F_ABORT;
381 }
382 lf_wakelock(lock, TRUE);
383 }
384 }
385
386 /*
387 * Take any lock attempts which are currently blocked by a given lock ("from")
388 * and mark them as blocked by a different lock ("to"). Used in the case
389 * where a byte range currently occupied by "from" is to be occupied by "to."
390 */
391 static void
392 lf_move_blocked(struct lockf *to, struct lockf *from)
393 {
394 struct lockf *tlock;
395
396 TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) {
397 tlock->lf_next = to;
398 }
399
400 TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block);
401 }
402
403 /*
404 * lf_coalesce_adjacent
405 *
406 * Description: Helper function: when setting a lock, coalesce adjacent
407 * locks. Needed because adjacent locks are not overlapping,
408 * but POSIX requires that they be coalesced.
409 *
410 * Parameters: lock The new lock which may be adjacent
411 * to already locked regions, and which
412 * should therefore be coalesced with them
413 *
414 * Returns: <void>
415 */
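/*
 * Example (illustrative): if the same owner already holds a shared lock on
 * bytes [0,99] and now sets a shared lock on [100,199], the two regions do
 * not overlap but are numerically adjacent, so this routine merges them
 * into a single lock covering [0,199].
 */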
416 static void
417 lf_coalesce_adjacent(struct lockf *lock)
418 {
419 struct lockf **lf = lock->lf_head;
420
421 while (*lf != NOLOCKF) {
422 /* reject locks that obviously could not be coalesced */
423 if ((*lf == lock) ||
424 ((*lf)->lf_id != lock->lf_id) ||
425 ((*lf)->lf_type != lock->lf_type)) {
426 lf = &(*lf)->lf_next;
427 continue;
428 }
429
430 /*
431 * NOTE: Assumes that if two locks are adjacent on the number line
432 * and belong to the same owner, then they are adjacent on the list.
433 */
434 if ((*lf)->lf_end != -1 &&
435 ((*lf)->lf_end + 1) == lock->lf_start) {
436 struct lockf *adjacent = *lf;
437
438 LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent previous\n");
439 lock->lf_start = (*lf)->lf_start;
440 *lf = lock;
441 lf = &(*lf)->lf_next;
442
443 lf_move_blocked(lock, adjacent);
444
445 FREE(adjacent, M_LOCKF);
446 continue;
447 }
448 /* If the lock starts adjacent to us, we can coalesce it */
449 if (lock->lf_end != -1 &&
450 (lock->lf_end + 1) == (*lf)->lf_start) {
451 struct lockf *adjacent = *lf;
452
453 LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent following\n");
454 lock->lf_end = (*lf)->lf_end;
455 lock->lf_next = (*lf)->lf_next;
456 lf = &lock->lf_next;
457
458 lf_move_blocked(lock, adjacent);
459
460 FREE(adjacent, M_LOCKF);
461 continue;
462 }
463
464 /* no matching conditions; go on to next lock */
465 lf = &(*lf)->lf_next;
466 }
467 }
468
469 /*
470 * lf_setlock
471 *
472 * Description: Set a byte-range lock.
473 *
474 * Parameters: lock The lock structure describing the lock
475 * to be set; allocated by the caller, it
476 * will be linked into the lock list if
477 * the set is successful, and freed if the
478 * set is unsuccessful.
479 *
480 * timeout Timeout specified in the case of
481 * SETLKWTIMEOUT.
482 *
483 * Returns: 0 Success
484 * EAGAIN
485 * EDEADLK
486 * lf_split:ENOLCK
487 * lf_clearlock:ENOLCK
488 * msleep:EINTR
489 * msleep:ETIMEDOUT
490 *
491 * Notes: We add the lock to the provisional lock list. We do not
492 * coalesce at this time; this has implications for other lock
493 * requestors in the blocker search mechanism.
494 */
495 static int
496 lf_setlock(struct lockf *lock, struct timespec *timeout)
497 {
498 struct lockf *block;
499 struct lockf **head = lock->lf_head;
500 struct lockf **prev, *overlap, *ltmp;
501 static char lockstr[] = "lockf";
502 int priority, needtolink, error;
503 struct vnode *vp = lock->lf_vnode;
504 overlap_t ovcase;
505
506 #ifdef LOCKF_DEBUGGING
507 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
508 lf_print("lf_setlock", lock);
509 lf_printlist("lf_setlock(in)", lock);
510 }
511 #endif /* LOCKF_DEBUGGING */
512 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p Looking for deadlock, vnode %p\n", lock, lock->lf_vnode);
513
514 /*
515 * Set the priority
516 */
517 priority = PLOCK;
518 if (lock->lf_type == F_WRLCK) {
519 priority += 4;
520 }
521 priority |= PCATCH;
522 scan:
523 /*
524 * Scan lock list for this file looking for locks that would block us.
525 */
526 while ((block = lf_getblock(lock, -1))) {
527 /*
528 * Free the structure and return if nonblocking.
529 */
530 if ((lock->lf_flags & F_WAIT) == 0) {
531 DTRACE_FSINFO(advlock__nowait, vnode_t, vp);
532 FREE(lock, M_LOCKF);
533 return EAGAIN;
534 }
535
536 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p found blocking lock %p\n", lock, block);
537
538 /*
539 * We are blocked. Since flock style locks cover
540 * the whole file, there is no chance for deadlock.
541 *
542 * OFD byte-range locks currently do NOT support
543 * deadlock detection.
544 *
545 * For POSIX byte-range locks we must check for deadlock.
546 *
547 * Deadlock detection is done by looking through the
548 * wait channels to see if there are any cycles that
549 * involve us.
550 */
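		/*
		 * Illustrative cycle (not from the original source): process A
		 * holds [0,99] and requests [100,199]; process B holds
		 * [100,199] and is already asleep requesting [0,99].  Walking
		 * B's sleeping threads below finds a blocked request whose
		 * blocker is owned by A itself, so A's request fails with
		 * EDEADLK instead of sleeping forever.
		 */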
551 if ((lock->lf_flags & F_POSIX) &&
552 (block->lf_flags & F_POSIX)) {
553 struct proc *wproc;
554 struct uthread *ut;
555
556 /* The block is waiting on something */
557 wproc = block->lf_owner;
558 proc_lock(wproc);
559 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(wproc));
560 TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
561 /*
562 * If the thread is asleep (uu_wchan != 0)
563 * in this code (uu_wmesg == lockstr)
564 * check to see if the lock is blocked behind
565 * someone blocked behind us.
566 */
567 if ((ut->uu_wchan != NULL) && (ut->uu_wmesg == lockstr)) {
568 struct lockf *waitblock = (struct lockf *)ut->uu_wchan;
569 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, waitblock, waitblock->lf_vnode);
570
571 vnode_t othervp = NULL;
572 if (waitblock->lf_vnode != vp) {
573 /*
574 * This thread in wproc is waiting for a lock
575 * on a different vnode; grab the lock on it
576 * that protects lf_next while we examine it.
577 */
578 othervp = waitblock->lf_vnode;
579 if (!lck_mtx_try_lock(&othervp->v_lock)) {
580 /*
581 * avoid kernel deadlock: drop all
582 * locks, pause for a bit to let the
583 * other thread do what it needs to do,
584 * then (because we drop and retake
585 * v_lock) retry the scan.
586 */
587 proc_unlock(wproc);
588 static struct timespec ts = {
589 .tv_sec = 0,
590 .tv_nsec = 10 * NSEC_PER_MSEC,
591 };
592 (void) msleep(lock, &vp->v_lock, priority, lockstr, &ts);
593 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p contention for vp %p => restart\n", lock, othervp);
594 goto scan;
595 }
596 }
597
598 /*
599 * Get the lock blocking the lock
600 * which would block us, and make
601 * certain it hasn't become unblocked
602 * (been granted, e.g. between the time
603 * we called lf_getblock, and the time
604 * we successfully acquired the
605 * proc_lock).
606 */
607 waitblock = waitblock->lf_next;
608 if (waitblock == NULL) {
609 if (othervp) {
610 lck_mtx_unlock(&othervp->v_lock);
611 }
612 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p with no lf_next\n", lock);
613 continue;
614 }
615 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, waitblock, waitblock->lf_vnode);
616
617 /*
618 * Make sure it's an advisory range
619 * lock and not any other kind of lock;
620 * if we mix lock types, it's our own
621 * fault.
622 */
623 if ((waitblock->lf_flags & F_POSIX) == 0) {
624 if (othervp) {
625 lck_mtx_unlock(&othervp->v_lock);
626 }
627 continue;
628 }
629
630 /*
631 * If the owner of the lock that's
632 * blocking a lock that's blocking us
633 * getting the requested lock, then we
634 * would deadlock, so error out.
635 */
636 struct proc *bproc = waitblock->lf_owner;
637 const boolean_t deadlocked = bproc == lock->lf_owner;
638
639 if (othervp) {
640 lck_mtx_unlock(&othervp->v_lock);
641 }
642 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(bproc));
643 if (deadlocked) {
644 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is me, so EDEADLK\n", lock);
645 proc_unlock(wproc);
646 FREE(lock, M_LOCKF);
647 return EDEADLK;
648 }
649 }
650 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p bottom of thread loop\n", lock);
651 }
652 proc_unlock(wproc);
653 }
654
655 /*
656 * For flock type locks, we must first remove
657 * any shared locks that we hold before we sleep
658 * waiting for an exclusive lock.
659 */
660 if ((lock->lf_flags & F_FLOCK) &&
661 lock->lf_type == F_WRLCK) {
662 lock->lf_type = F_UNLCK;
663 if ((error = lf_clearlock(lock)) != 0) {
664 FREE(lock, M_LOCKF);
665 return error;
666 }
667 lock->lf_type = F_WRLCK;
668 }
669 /*
670 * Add our lock to the blocked list and sleep until we're free.
671 * Remember who blocked us (for deadlock detection).
672 */
673 lock->lf_next = block;
674 TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
675
676 if (!(lock->lf_flags & F_FLOCK)) {
677 block->lf_flags &= ~F_WAKE1_SAFE;
678 }
679
680 #if IMPORTANCE_INHERITANCE
681 /*
682 * Importance donation is done only for cases where the
683 * owning task can be unambiguously determined.
684 *
685 * POSIX type locks are not inherited by child processes;
686 * we maintain a 1:1 mapping between a lock and its owning
687 * process.
688 *
689 * Flock type locks are inherited across fork() and there is
690 * no 1:1 mapping in the general case. However, the fileglobs
691 * used by OFD locks *may* be confined to the process that
692 * created them, and thus have an "owner", in which case
693 * we also attempt importance donation.
694 */
695 if ((lock->lf_flags & block->lf_flags & F_POSIX) != 0) {
696 lf_boost_blocking_proc(lock, block);
697 } else if ((lock->lf_flags & block->lf_flags & F_OFD_LOCK) &&
698 lock->lf_owner != block->lf_owner &&
699 NULL != lock->lf_owner && NULL != block->lf_owner) {
700 lf_boost_blocking_proc(lock, block);
701 }
702 #endif /* IMPORTANCE_INHERITANCE */
703
704 #ifdef LOCKF_DEBUGGING
705 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
706 lf_print("lf_setlock: blocking on", block);
707 lf_printlist("lf_setlock(block)", block);
708 }
709 #endif /* LOCKF_DEBUGGING */
710 DTRACE_FSINFO(advlock__wait, vnode_t, vp);
711
712 error = msleep(lock, &vp->v_lock, priority, lockstr, timeout);
713
714 if (error == 0 && (lock->lf_flags & F_ABORT) != 0) {
715 error = EBADF;
716 }
717
718 if (lock->lf_next) {
719 /*
720 * lf_wakelock() always sets wakelock->lf_next to
721 * NULL before a wakeup; so we've been woken early
722 * - perhaps by a debugger, signal or other event.
723 *
724 * Remove 'lock' from the block list (avoids double-add
725 * in the spurious case, which would create a cycle)
726 */
727 TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
728 #if IMPORTANCE_INHERITANCE
729 /*
730 * Adjust the boost on lf_next.
731 */
732 lf_adjust_assertion(lock->lf_next);
733 #endif /* IMPORTANCE_INHERITANCE */
734 lock->lf_next = NULL;
735
736 if (error == 0) {
737 /*
738 * If this was a spurious wakeup, retry
739 */
740 printf("%s: spurious wakeup, retrying lock\n",
741 __func__);
742 continue;
743 }
744 }
745
746 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
747 if ((block = lf_getblock(lock, -1)) != NULL) {
748 lf_move_blocked(block, lock);
749 }
750 }
751
752 if (error) {
753 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
754 lf_wakelock(lock, TRUE);
755 }
756 FREE(lock, M_LOCKF);
757 /* Return ETIMEDOUT if timeout occurred. */
758 if (error == EWOULDBLOCK) {
759 error = ETIMEDOUT;
760 }
761 return error;
762 }
763 }
764
765 /*
766 * No blocks!! Add the lock. Note that we will
767 * downgrade or upgrade any overlapping locks this
768 * process already owns.
769 *
770 * Skip over locks owned by other processes.
771 * Handle any locks that overlap and are owned by ourselves.
772 */
773 prev = head;
774 block = *head;
775 needtolink = 1;
776 for (;;) {
777 ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
778 if (ovcase) {
779 block = overlap->lf_next;
780 }
781 /*
782 * Six cases:
783 * 0) no overlap
784 * 1) overlap == lock
785 * 2) overlap contains lock
786 * 3) lock contains overlap
787 * 4) overlap starts before lock
788 * 5) overlap ends after lock
789 */
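		/*
		 * Approximate sketch of the overlap cases (illustrative
		 * ranges; 'lock' is the request, 'overlap' the existing
		 * lock found by lf_findoverlap):
		 *
		 *	lock (request):     |=======|
		 *	1) equals lock:     |=======|
		 *	2) contains lock:  |==========|
		 *	3) contained by:      |====|
		 *	4) starts before:  |=====|
		 *	5) ends after:          |=====|
		 */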
790 switch (ovcase) {
791 case OVERLAP_NONE:
792 if (needtolink) {
793 *prev = lock;
794 lock->lf_next = overlap;
795 }
796 break;
797
798 case OVERLAP_EQUALS_LOCK:
799 /*
800 * If downgrading lock, others may be
801 * able to acquire it.
802 */
803 if (lock->lf_type == F_RDLCK &&
804 overlap->lf_type == F_WRLCK) {
805 lf_wakelock(overlap, TRUE);
806 }
807 overlap->lf_type = lock->lf_type;
808 FREE(lock, M_LOCKF);
809 lock = overlap; /* for lf_coalesce_adjacent() */
810 break;
811
812 case OVERLAP_CONTAINS_LOCK:
813 /*
814 * Check for common starting point and different types.
815 */
816 if (overlap->lf_type == lock->lf_type) {
817 FREE(lock, M_LOCKF);
818 lock = overlap; /* for lf_coalesce_adjacent() */
819 break;
820 }
821 if (overlap->lf_start == lock->lf_start) {
822 *prev = lock;
823 lock->lf_next = overlap;
824 overlap->lf_start = lock->lf_end + 1;
825 } else {
826 /*
827 * If we can't split the lock, we can't
828 * grant it. Claim a system limit for the
829 * resource shortage.
830 */
831 if (lf_split(overlap, lock)) {
832 FREE(lock, M_LOCKF);
833 return ENOLCK;
834 }
835 }
836 lf_wakelock(overlap, TRUE);
837 break;
838
839 case OVERLAP_CONTAINED_BY_LOCK:
840 /*
841 * If downgrading lock, others may be able to
842 * acquire it, otherwise take the list.
843 */
844 if (lock->lf_type == F_RDLCK &&
845 overlap->lf_type == F_WRLCK) {
846 lf_wakelock(overlap, TRUE);
847 } else {
848 while (!TAILQ_EMPTY(&overlap->lf_blkhd)) {
849 ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
850 TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
851 lf_block);
852 TAILQ_INSERT_TAIL(&lock->lf_blkhd,
853 ltmp, lf_block);
854 ltmp->lf_next = lock;
855 }
856 }
857 /*
858 * Add the new lock if necessary and delete the overlap.
859 */
860 if (needtolink) {
861 *prev = lock;
862 lock->lf_next = overlap->lf_next;
863 prev = &lock->lf_next;
864 needtolink = 0;
865 } else {
866 *prev = overlap->lf_next;
867 }
868 FREE(overlap, M_LOCKF);
869 continue;
870
871 case OVERLAP_STARTS_BEFORE_LOCK:
872 /*
873 * Add lock after overlap on the list.
874 */
875 lock->lf_next = overlap->lf_next;
876 overlap->lf_next = lock;
877 overlap->lf_end = lock->lf_start - 1;
878 prev = &lock->lf_next;
879 lf_wakelock(overlap, TRUE);
880 needtolink = 0;
881 continue;
882
883 case OVERLAP_ENDS_AFTER_LOCK:
884 /*
885 * Add the new lock before overlap.
886 */
887 if (needtolink) {
888 *prev = lock;
889 lock->lf_next = overlap;
890 }
891 overlap->lf_start = lock->lf_end + 1;
892 lf_wakelock(overlap, TRUE);
893 break;
894 }
895 break;
896 }
897 /* Coalesce adjacent locks with identical attributes */
898 lf_coalesce_adjacent(lock);
899 #ifdef LOCKF_DEBUGGING
900 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
901 lf_print("lf_setlock: got the lock", lock);
902 lf_printlist("lf_setlock(out)", lock);
903 }
904 #endif /* LOCKF_DEBUGGING */
905 return 0;
906 }
907
908
909 /*
910 * lf_clearlock
911 *
912 * Description: Remove a byte-range lock on a vnode. Generally, find the
913 * lock (or an overlap to that lock) and remove it (or shrink
914 * it), then wakeup anyone we can.
915 *
916 * Parameters: unlock The lock to clear
917 *
918 * Returns: 0 Success
919 * lf_split:ENOLCK
920 *
921 * Notes: A caller may unlock all the locks owned by the caller by
922 * specifying the entire file range; locks owned by other
923 * callers are not affected by this operation.
924 */
925 static int
926 lf_clearlock(struct lockf *unlock)
927 {
928 struct lockf **head = unlock->lf_head;
929 struct lockf *lf = *head;
930 struct lockf *overlap, **prev;
931 overlap_t ovcase;
932
933 if (lf == NOLOCKF) {
934 return 0;
935 }
936 #ifdef LOCKF_DEBUGGING
937 if (unlock->lf_type != F_UNLCK) {
938 panic("lf_clearlock: bad type");
939 }
940 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
941 lf_print("lf_clearlock", unlock);
942 }
943 #endif /* LOCKF_DEBUGGING */
944 prev = head;
945 while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
946 /*
947 * Wakeup the list of locks to be retried.
948 */
949 lf_wakelock(overlap, FALSE);
950 #if IMPORTANCE_INHERITANCE
951 if (overlap->lf_boosted == LF_BOOSTED) {
952 lf_drop_assertion(overlap);
953 }
954 #endif /* IMPORTANCE_INHERITANCE */
955
956 switch (ovcase) {
957 case OVERLAP_NONE: /* satisfy compiler enum/switch */
958 break;
959
960 case OVERLAP_EQUALS_LOCK:
961 *prev = overlap->lf_next;
962 FREE(overlap, M_LOCKF);
963 break;
964
965 case OVERLAP_CONTAINS_LOCK: /* split it */
966 if (overlap->lf_start == unlock->lf_start) {
967 overlap->lf_start = unlock->lf_end + 1;
968 break;
969 }
970 /*
971 * If we can't split the lock, we can't grant it.
972 * Claim a system limit for the resource shortage.
973 */
974 if (lf_split(overlap, unlock)) {
975 return ENOLCK;
976 }
977 overlap->lf_next = unlock->lf_next;
978 break;
979
980 case OVERLAP_CONTAINED_BY_LOCK:
981 *prev = overlap->lf_next;
982 lf = overlap->lf_next;
983 FREE(overlap, M_LOCKF);
984 continue;
985
986 case OVERLAP_STARTS_BEFORE_LOCK:
987 overlap->lf_end = unlock->lf_start - 1;
988 prev = &overlap->lf_next;
989 lf = overlap->lf_next;
990 continue;
991
992 case OVERLAP_ENDS_AFTER_LOCK:
993 overlap->lf_start = unlock->lf_end + 1;
994 break;
995 }
996 break;
997 }
998 #ifdef LOCKF_DEBUGGING
999 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1000 lf_printlist("lf_clearlock", unlock);
1001 }
1002 #endif /* LOCKF_DEBUGGING */
1003 return 0;
1004 }
1005
1006
1007 /*
1008 * lf_getlock
1009 *
1010 * Description: Check whether there is a blocking lock, and if so return
1011 * its process identifier into the lock being requested.
1012 *
1013 * Parameters: lock Pointer to lock to test for blocks
1014 * fl Pointer to flock structure to receive
1015 * the blocking lock information, if a
1016 * blocking lock is found.
1017 * matchpid -1, or pid value to match in lookup.
1018 *
1019 * Returns: 0 Success
1020 *
1021 * Implicit Returns:
1022 * *fl Contents modified to reflect the
1023 * blocking lock, if one is found; not
1024 * modified otherwise
1025 *
1026 * Notes: fl->l_pid will be (-1) for file locks and will only be set to
1027 * the blocking process ID for advisory record locks.
1028 */
1029 static int
1030 lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid)
1031 {
1032 struct lockf *block;
1033
1034 #ifdef LOCKF_DEBUGGING
1035 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1036 lf_print("lf_getlock", lock);
1037 }
1038 #endif /* LOCKF_DEBUGGING */
1039
1040 if ((block = lf_getblock(lock, matchpid))) {
1041 fl->l_type = block->lf_type;
1042 fl->l_whence = SEEK_SET;
1043 fl->l_start = block->lf_start;
1044 if (block->lf_end == -1) {
1045 fl->l_len = 0;
1046 } else {
1047 fl->l_len = block->lf_end - block->lf_start + 1;
1048 }
1049 if (NULL != block->lf_owner) {
1050 /*
1051 * lf_owner is only non-NULL when the lock
1052 * "owner" can be unambiguously determined
1053 */
1054 fl->l_pid = proc_pid(block->lf_owner);
1055 } else {
1056 fl->l_pid = -1;
1057 }
1058 } else {
1059 fl->l_type = F_UNLCK;
1060 }
1061 return 0;
1062 }
1063
1064 /*
1065 * lf_getblock
1066 *
1067 * Description: Walk the list of locks for an inode and return the first
1068 * blocking lock. A lock is considered blocking if we are not
1069 * the lock owner; otherwise, we are permitted to upgrade or
1070 * downgrade it, and it's not considered blocking.
1071 *
1072 * Parameters: lock The lock for which we are interested
1073 * in obtaining the blocking lock, if any
1074 * matchpid -1, or pid value to match in lookup.
1075 *
1076 * Returns: NOLOCKF No blocking lock exists
1077 * !NOLOCKF The address of the blocking lock's
1078 * struct lockf.
1079 */
1080 static struct lockf *
1081 lf_getblock(struct lockf *lock, pid_t matchpid)
1082 {
1083 struct lockf **prev, *overlap, *lf = *(lock->lf_head);
1084
1085 for (prev = lock->lf_head;
1086 lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != OVERLAP_NONE;
1087 lf = overlap->lf_next) {
1088 /*
1089 * Found an overlap.
1090 *
1091 * If we're matching pids, and it's a record lock,
1092 * or it's an OFD lock on a process-confined fd,
1093 * but the pid doesn't match, then keep on looking ..
1094 */
1095 if (matchpid != -1 &&
1096 (overlap->lf_flags & (F_POSIX | F_OFD_LOCK)) != 0 &&
1097 proc_pid(overlap->lf_owner) != matchpid) {
1098 continue;
1099 }
1100
1101 /*
1102 * does it block us?
1103 */
1104 if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) {
1105 return overlap;
1106 }
1107 }
1108 return NOLOCKF;
1109 }
1110
1111
1112 /*
1113 * lf_findoverlap
1114 *
1115 * Description: Walk the list of locks to find an overlapping lock (if any).
1116 *
1117 * Parameters: lf First lock on lock list
1118 * lock The lock we are checking for an overlap
1119 * check Check type
1120 * prev pointer to a pointer-to-pointer; receives
1121 * the address of the link (list head or prior
1122 * lock's next pointer) to the overlapping lock
1123 * overlap pointer to pointer to contain address
1124 * of overlapping lock
1125 *
1126 * Returns: OVERLAP_NONE
1127 * OVERLAP_EQUALS_LOCK
1128 * OVERLAP_CONTAINS_LOCK
1129 * OVERLAP_CONTAINED_BY_LOCK
1130 * OVERLAP_STARTS_BEFORE_LOCK
1131 * OVERLAP_ENDS_AFTER_LOCK
1132 *
1133 * Implicit Returns:
1134 * *prev The address of the next pointer in the
1135 * lock previous to the overlapping lock;
1136 * this is generally used to relink the
1137 * lock list, avoiding a second iteration.
1138 * *overlap The pointer to the overlapping lock
1139 * itself; this is used to return data in
1140 * the check == OTHERS case, and for the
1141 * caller to modify the overlapping lock,
1142 * in the check == SELF case
1143 *
1144 * Note: This returns only the FIRST overlapping lock. There may be
1145 * more than one. lf_getlock will return the first blocking lock,
1146 * while lf_setlock will iterate over and resolve all overlapping locks.
1147 *
1148 * The check parameter can be SELF, meaning we are looking for
1149 * overlapping locks owned by us, or it can be OTHERS, meaning
1150 * we are looking for overlapping locks owned by someone else so
1151 * we can report a blocking lock on an F_GETLK request.
1152 *
1153 * The value of *overlap and *prev are modified, even if there is
1154 * no overlapping lock found; always check the return code.
1155 */
1156 static overlap_t
1157 lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
1158 struct lockf ***prev, struct lockf **overlap)
1159 {
1160 off_t start, end;
1161 int found_self = 0;
1162
1163 *overlap = lf;
1164 if (lf == NOLOCKF) {
1165 return 0;
1166 }
1167 #ifdef LOCKF_DEBUGGING
1168 if (LOCKF_DEBUGP(LF_DBG_LIST)) {
1169 lf_print("lf_findoverlap: looking for overlap in", lock);
1170 }
1171 #endif /* LOCKF_DEBUGGING */
1172 start = lock->lf_start;
1173 end = lock->lf_end;
1174 while (lf != NOLOCKF) {
1175 if (((type & SELF) && lf->lf_id != lock->lf_id) ||
1176 ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
1177 /*
1178 * Locks belonging to one process are adjacent on the
1179 * list, so if we've found any locks belonging to us,
1180 * and we're now seeing something else, then we've
1181 * examined all "self" locks. Note that bailing out
1182 * here is quite important; for coalescing, we assume
1183 * numerically adjacent locks from the same owner to
1184 * be adjacent on the list.
1185 */
1186 if ((type & SELF) && found_self) {
1187 return OVERLAP_NONE;
1188 }
1189
1190 *prev = &lf->lf_next;
1191 *overlap = lf = lf->lf_next;
1192 continue;
1193 }
1194
1195 if ((type & SELF)) {
1196 found_self = 1;
1197 }
1198
1199 #ifdef LOCKF_DEBUGGING
1200 if (LOCKF_DEBUGP(LF_DBG_LIST)) {
1201 lf_print("\tchecking", lf);
1202 }
1203 #endif /* LOCKF_DEBUGGING */
1204 /*
1205 * OK, check for overlap
1206 */
1207 if ((lf->lf_end != -1 && start > lf->lf_end) ||
1208 (end != -1 && lf->lf_start > end)) {
1209 /* Case 0 */
1210 LOCKF_DEBUG(LF_DBG_LIST, "no overlap\n");
1211
1212 /*
1213 * NOTE: assumes that locks for the same process are
1214 * nonintersecting and ordered.
1215 */
1216 if ((type & SELF) && end != -1 && lf->lf_start > end) {
1217 return OVERLAP_NONE;
1218 }
1219 *prev = &lf->lf_next;
1220 *overlap = lf = lf->lf_next;
1221 continue;
1222 }
1223 if ((lf->lf_start == start) && (lf->lf_end == end)) {
1224 LOCKF_DEBUG(LF_DBG_LIST, "overlap == lock\n");
1225 return OVERLAP_EQUALS_LOCK;
1226 }
1227 if ((lf->lf_start <= start) &&
1228 (end != -1) &&
1229 ((lf->lf_end >= end) || (lf->lf_end == -1))) {
1230 LOCKF_DEBUG(LF_DBG_LIST, "overlap contains lock\n");
1231 return OVERLAP_CONTAINS_LOCK;
1232 }
1233 if (start <= lf->lf_start &&
1234 (end == -1 ||
1235 (lf->lf_end != -1 && end >= lf->lf_end))) {
1236 LOCKF_DEBUG(LF_DBG_LIST, "lock contains overlap\n");
1237 return OVERLAP_CONTAINED_BY_LOCK;
1238 }
1239 if ((lf->lf_start < start) &&
1240 ((lf->lf_end >= start) || (lf->lf_end == -1))) {
1241 LOCKF_DEBUG(LF_DBG_LIST, "overlap starts before lock\n");
1242 return OVERLAP_STARTS_BEFORE_LOCK;
1243 }
1244 if ((lf->lf_start > start) &&
1245 (end != -1) &&
1246 ((lf->lf_end > end) || (lf->lf_end == -1))) {
1247 LOCKF_DEBUG(LF_DBG_LIST, "overlap ends after lock\n");
1248 return OVERLAP_ENDS_AFTER_LOCK;
1249 }
1250 panic("lf_findoverlap: default");
1251 }
1252 return OVERLAP_NONE;
1253 }
1254
1255
1256 /*
1257 * lf_split
1258 *
1259 * Description: Split a lock and a contained region into two or three locks
1260 * as necessary.
1261 *
1262 * Parameters: lock1 Lock to split
1263 * lock2 Overlapping lock region requiring the
1264 * split (upgrade/downgrade/unlock)
1265 *
1266 * Returns: 0 Success
1267 * ENOLCK No memory for new lock
1268 *
1269 * Implicit Returns:
1270 * *lock1 Modified original lock
1271 * *lock2 Overlapping lock (inserted into list)
1272 * (new lock) Potential new lock inserted into list
1273 * if split results in 3 locks
1274 *
1275 * Notes: This operation can only fail if the split would result in three
1276 * locks, and there is insufficient memory to allocate the third
1277 * lock; in that case, neither of the locks will be modified.
1278 */
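/*
 * Worked example (illustrative): lock1 covers [100,299] and lock2 describes
 * [150,199].  Neither endpoint coincides, so a third lock is allocated:
 * lock1 is trimmed to [100,149], the new splitlock covers [200,299], and
 * the list is relinked as lock1 -> lock2 -> splitlock.
 */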
1279 static int
1280 lf_split(struct lockf *lock1, struct lockf *lock2)
1281 {
1282 struct lockf *splitlock;
1283
1284 #ifdef LOCKF_DEBUGGING
1285 if (LOCKF_DEBUGP(LF_DBG_LIST)) {
1286 lf_print("lf_split", lock1);
1287 lf_print("splitting from", lock2);
1288 }
1289 #endif /* LOCKF_DEBUGGING */
1290 /*
1291 * Check to see if splitting into only two pieces.
1292 */
1293 if (lock1->lf_start == lock2->lf_start) {
1294 lock1->lf_start = lock2->lf_end + 1;
1295 lock2->lf_next = lock1;
1296 return 0;
1297 }
1298 if (lock1->lf_end == lock2->lf_end) {
1299 lock1->lf_end = lock2->lf_start - 1;
1300 lock2->lf_next = lock1->lf_next;
1301 lock1->lf_next = lock2;
1302 return 0;
1303 }
1304 /*
1305 * Make a new lock consisting of the last part of
1306 * the encompassing lock
1307 */
1308 MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
1309 if (splitlock == NULL) {
1310 return ENOLCK;
1311 }
1312 bcopy(lock1, splitlock, sizeof *splitlock);
1313 splitlock->lf_start = lock2->lf_end + 1;
1314 TAILQ_INIT(&splitlock->lf_blkhd);
1315 lock1->lf_end = lock2->lf_start - 1;
1316 /*
1317 * OK, now link it in
1318 */
1319 splitlock->lf_next = lock1->lf_next;
1320 lock2->lf_next = splitlock;
1321 lock1->lf_next = lock2;
1322
1323 return 0;
1324 }
1325
1326
1327 /*
1328 * lf_wakelock
1329 *
1330 * Wakeup a blocklist in the case of a downgrade or unlock, since others
1331 * waiting on the lock may now be able to acquire it.
1332 *
1333 * Parameters: listhead Lock list head on which waiters may
1334 * have pending locks
 * force_all If TRUE, wake all waiters; if FALSE, a list
 * marked F_WAKE1_SAFE wakes only its first waiter
1335 *
1336 * Returns: <void>
1337 *
1338 * Notes: This function iterates a list of locks and wakes all waiters,
1339 * rather than only waiters for the contended regions. Because
1340 * of this, for heavily contended files, this can result in a
1341 * "thundering herd" situation. Refactoring the code could make
1342 * this operation more efficient, if heavy contention ever results
1343 * in a real-world performance problem.
1344 */
1345 static void
1346 lf_wakelock(struct lockf *listhead, boolean_t force_all)
1347 {
1348 struct lockf *wakelock;
1349 boolean_t wake_all = TRUE;
1350
1351 if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE)) {
1352 wake_all = FALSE;
1353 }
1354
1355 while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1356 wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
1357 TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
1358
1359 wakelock->lf_next = NOLOCKF;
1360 #ifdef LOCKF_DEBUGGING
1361 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1362 lf_print("lf_wakelock: awakening", wakelock);
1363 }
1364 #endif /* LOCKF_DEBUGGING */
1365 if (wake_all == FALSE) {
1366 /*
1367 * If there are items on the list head block list,
1368 * move them to the wakelock list instead, and then
1369 * correct their lf_next pointers.
1370 */
1371 if (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1372 TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);
1373
1374 struct lockf *tlock;
1375
1376 TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
1377 if (TAILQ_NEXT(tlock, lf_block) == tlock) {
1378 /* See rdar://10887303 */
1379 panic("cycle in wakelock list");
1380 }
1381 tlock->lf_next = wakelock;
1382 }
1383 }
1384 }
1385 wakeup(wakelock);
1386
1387 if (wake_all == FALSE) {
1388 break;
1389 }
1390 }
1391 }
1392
1393
1394 #ifdef LOCKF_DEBUGGING
1395 #define GET_LF_OWNER_PID(lf) (proc_pid((lf)->lf_owner))
1396
1397 /*
1398 * lf_print DEBUG
1399 *
1400 * Print out a lock; lock information is prefixed by the string in 'tag'
1401 *
1402 * Parameters: tag A string tag for debugging
1403 * lock The lock whose information should be
1404 * displayed
1405 *
1406 * Returns: <void>
1407 */
1408 void
1409 lf_print(const char *tag, struct lockf *lock)
1410 {
1411 printf("%s: lock %p for ", tag, (void *)lock);
1412 if (lock->lf_flags & F_POSIX) {
1413 printf("proc %p (owner %d)",
1414 lock->lf_id, GET_LF_OWNER_PID(lock));
1415 } else if (lock->lf_flags & F_OFD_LOCK) {
1416 printf("fg %p (owner %d)",
1417 lock->lf_id, GET_LF_OWNER_PID(lock));
1418 } else {
1419 printf("id %p", (void *)lock->lf_id);
1420 }
1421 if (lock->lf_vnode != 0) {
1422 printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
1423 lock->lf_vnode,
1424 lock->lf_type == F_RDLCK ? "shared" :
1425 lock->lf_type == F_WRLCK ? "exclusive" :
1426 lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1427 (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1428 } else {
1429 printf(" %s, start 0x%016llx, end 0x%016llx",
1430 lock->lf_type == F_RDLCK ? "shared" :
1431 lock->lf_type == F_WRLCK ? "exclusive" :
1432 lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1433 (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1434 }
1435 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
1436 printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
1437 } else {
1438 printf("\n");
1439 }
1440 }
1441
1442
1443 /*
1444 * lf_printlist DEBUG
1445 *
1446 * Print out a lock list for the vnode associated with 'lock'; lock information
1447 * is prefixed by the string in 'tag'
1448 *
1449 * Parameters: tag A string tag for debugging
1450 * lock The lock whose vnode's lock list should
1451 * be displayed
1452 *
1453 * Returns: <void>
1454 */
1455 void
1456 lf_printlist(const char *tag, struct lockf *lock)
1457 {
1458 struct lockf *lf, *blk;
1459
1460 if (lock->lf_vnode == 0) {
1461 return;
1462 }
1463
1464 printf("%s: Lock list for vno %p:\n",
1465 tag, lock->lf_vnode);
1466 for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) {
1467 printf("\tlock %p for ", (void *)lf);
1468 if (lf->lf_flags & F_POSIX) {
1469 printf("proc %p (owner %d)",
1470 lf->lf_id, GET_LF_OWNER_PID(lf));
1471 } else if (lf->lf_flags & F_OFD_LOCK) {
1472 printf("fg %p (owner %d)",
1473 lf->lf_id, GET_LF_OWNER_PID(lf));
1474 } else {
1475 printf("id %p", (void *)lf->lf_id);
1476 }
1477 printf(", %s, start 0x%016llx, end 0x%016llx",
1478 lf->lf_type == F_RDLCK ? "shared" :
1479 lf->lf_type == F_WRLCK ? "exclusive" :
1480 lf->lf_type == F_UNLCK ? "unlock" :
1481 "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
1482 TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
1483 printf("\n\t\tlock request %p for ", (void *)blk);
1484 if (blk->lf_flags & F_POSIX) {
1485 printf("proc %p (owner %d)",
1486 blk->lf_id, GET_LF_OWNER_PID(blk));
1487 } else if (blk->lf_flags & F_OFD_LOCK) {
1488 printf("fg %p (owner %d)",
1489 blk->lf_id, GET_LF_OWNER_PID(blk));
1490 } else {
1491 printf("id %p", (void *)blk->lf_id);
1492 }
1493 printf(", %s, start 0x%016llx, end 0x%016llx",
1494 blk->lf_type == F_RDLCK ? "shared" :
1495 blk->lf_type == F_WRLCK ? "exclusive" :
1496 blk->lf_type == F_UNLCK ? "unlock" :
1497 "unknown", (intmax_t)blk->lf_start,
1498 (intmax_t)blk->lf_end);
1499 if (!TAILQ_EMPTY(&blk->lf_blkhd)) {
1500 panic("lf_printlist: bad list");
1501 }
1502 }
1503 printf("\n");
1504 }
1505 }
1506 #endif /* LOCKF_DEBUGGING */
1507
1508 #if IMPORTANCE_INHERITANCE
1509
1510 /*
1511 * lf_hold_assertion
1512 *
1513 * Call task importance hold assertion on the owner of the lock.
1514 *
1515 * Parameters: block_task Owner of the lock blocking
1516 * current thread.
1517 *
1518 * block lock on which the current thread
1519 * is blocked.
1520 *
1521 * Returns: <void>
1522 *
1523 * Notes: No task reference on block_task needs to be held, since
1524 * the current thread holds the vnode lock and block_task holds
1525 * a file lock; removing that file lock at exit requires
1526 * block_task to grab the vnode lock first.
1527 */
1528 static void
1529 lf_hold_assertion(task_t block_task, struct lockf *block)
1530 {
1531 if (task_importance_hold_file_lock_assertion(block_task, 1) == 0) {
1532 block->lf_boosted = LF_BOOSTED;
1533 LOCKF_DEBUG(LF_DBG_IMPINH,
1534 "lf: importance hold file lock assert on pid %d lock %p\n",
1535 proc_pid(block->lf_owner), block);
1536 }
1537 }
1538
1539
1540 /*
1541 * lf_jump_to_queue_head
1542 *
1543 * Jump the lock from the tail of the block queue to the head of
1544 * the queue.
1545 *
1546 * Parameters: block lockf struct containing the
1547 * block queue.
1548 * lock lockf struct to be jumped to the
1549 * front.
1550 *
1551 * Returns: <void>
1552 */
1553 static void
1554 lf_jump_to_queue_head(struct lockf *block, struct lockf *lock)
1555 {
1556 /* Move the lock to the head of the block queue. */
1557 TAILQ_REMOVE(&block->lf_blkhd, lock, lf_block);
1558 TAILQ_INSERT_HEAD(&block->lf_blkhd, lock, lf_block);
1559 }
1560
1561
1562 /*
1563 * lf_drop_assertion
1564 *
1565 * Drops the task hold assertion.
1566 *
1567 * Parameters: block lockf struct holding the assertion.
1568 *
1569 * Returns: <void>
1570 */
1571 static void
1572 lf_drop_assertion(struct lockf *block)
1573 {
1574 LOCKF_DEBUG(LF_DBG_IMPINH, "lf: %d: dropping assertion for lock %p\n",
1575 proc_pid(block->lf_owner), block);
1576
1577 task_t current_task = proc_task(block->lf_owner);
1578 task_importance_drop_file_lock_assertion(current_task, 1);
1579 block->lf_boosted = LF_NOT_BOOSTED;
1580 }
1581
1582 /*
1583 * lf_adjust_assertion
1584 *
1585 * Adjusts importance assertion of file lock. Goes through
1586 * all the blocking locks and checks whether the file lock
1587 * still needs to be boosted.
1588 *
1589 * Parameters: block lockf structure which needs to be adjusted.
1590 *
1591 * Returns: <void>
1592 */
1593 static void
1594 lf_adjust_assertion(struct lockf *block)
1595 {
1596 boolean_t drop_boost = TRUE;
1597 struct lockf *next;
1598
1599 /* Return if the lock is not boosted */
1600 if (block->lf_boosted == LF_NOT_BOOSTED) {
1601 return;
1602 }
1603
1604 TAILQ_FOREACH(next, &block->lf_blkhd, lf_block) {
1605 /* Check if block and next are same type of locks */
1606 if (((block->lf_flags & next->lf_flags & F_POSIX) != 0) ||
1607 ((block->lf_flags & next->lf_flags & F_OFD_LOCK) &&
1608 (block->lf_owner != next->lf_owner) &&
1609 (NULL != block->lf_owner && NULL != next->lf_owner))) {
1610 /* Check if next would be boosting block */
1611 if (task_is_importance_donor(proc_task(next->lf_owner)) &&
1612 task_is_importance_receiver_type(proc_task(block->lf_owner))) {
1613 /* Found a lock boosting block */
1614 drop_boost = FALSE;
1615 break;
1616 }
1617 }
1618 }
1619
1620 if (drop_boost) {
1621 lf_drop_assertion(block);
1622 }
1623 }
1624
1625 static void
1626 lf_boost_blocking_proc(struct lockf *lock, struct lockf *block)
1627 {
1628 task_t ltask = proc_task(lock->lf_owner);
1629 task_t btask = proc_task(block->lf_owner);
1630
1631 /*
1632 * Check if ltask can donate importance. The
1633 * check of imp_donor bit is done without holding
1634 * any lock. The value may change after you read it,
1635 * but it is ok to boost a task while someone else is
1636 * unboosting you.
1637 *
1638 * TODO: Support live inheritance on file locks.
1639 */
1640 if (task_is_importance_donor(ltask)) {
1641 LOCKF_DEBUG(LF_DBG_IMPINH,
1642 "lf: %d: attempt to boost pid %d that holds lock %p\n",
1643 proc_pid(lock->lf_owner), proc_pid(block->lf_owner), block);
1644
1645 if (block->lf_boosted != LF_BOOSTED &&
1646 task_is_importance_receiver_type(btask)) {
1647 lf_hold_assertion(btask, block);
1648 }
1649 lf_jump_to_queue_head(block, lock);
1650 }
1651 }
1652 #endif /* IMPORTANCE_INHERITANCE */