bsd/kern/kern_lockf.c

   1 /*
   2  * Copyright (c) 2015 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1989, 1993
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * This code is derived from software contributed to Berkeley by
  33  * Scooter Morris at Genentech Inc.
  34  *
  35  * Redistribution and use in source and binary forms, with or without
  36  * modification, are permitted provided that the following conditions
  37  * are met:
  38  * 1. Redistributions of source code must retain the above copyright
  39  *    notice, this list of conditions and the following disclaimer.
  40  * 2. Redistributions in binary form must reproduce the above copyright
  41  *    notice, this list of conditions and the following disclaimer in the
  42  *    documentation and/or other materials provided with the distribution.
  43  * 4. Neither the name of the University nor the names of its contributors
  44  *    may be used to endorse or promote products derived from this software
  45  *    without specific prior written permission.
  46  *
  47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  57  * SUCH DAMAGE.
  58  *
  59  *      @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
  60  */
  61
  62 #include <sys/cdefs.h>
  63 #include <sys/param.h>
  64 #include <sys/systm.h>
  65 #include <sys/kernel.h>
  66 #include <sys/lock.h>
  67 #include <sys/mount.h>
  68 #include <sys/proc.h>
  69 #include <sys/signalvar.h>
  70 #include <sys/unistd.h>
  71 #include <sys/user.h>
  72 #include <sys/vnode.h>
  73 #include <sys/vnode_internal.h>
  74 #include <sys/vnode_if.h>
  75 #include <sys/malloc.h>
  76 #include <sys/fcntl.h>
  77 #include <sys/lockf.h>
  78 #include <sys/sdt.h>
  79 #include <kern/policy_internal.h>
  80
  81 #include <sys/file_internal.h>
  82
  83 /*
  84  * This variable controls the maximum number of processes that will
  85  * be checked in doing deadlock detection.
  86  */
  87 static int maxlockdepth = MAXDEPTH;
  88
  89 #if (DEVELOPMENT || DEBUG)
  90 #define LOCKF_DEBUGGING 1
  91 #endif
  92
  93 #ifdef LOCKF_DEBUGGING
  94 #include <sys/sysctl.h>
  95 void lf_print(const char *tag, struct lockf *lock);
  96 void lf_printlist(const char *tag, struct lockf *lock);
  97
  98 #define LF_DBG_LOCKOP   (1 << 0)        /* setlk, getlk, clearlk */
  99 #define LF_DBG_LIST     (1 << 1)        /* split, coalesce */
 100 #define LF_DBG_IMPINH   (1 << 2)        /* importance inheritance */
 101 #define LF_DBG_TRACE    (1 << 3)        /* errors, exit */
 102
 103 static int      lockf_debug = 0;        /* was 2, could be 3 ;-) */
 104 SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, "");
 105
 106 /*
 107  * If there is no mask bit selector, or there is one, and the selector is
 108  * set, then output the debugging diagnostic.
 109  */
 110 #define LOCKF_DEBUG(mask, ...)                                  \
 111         do {                                                    \
 112                 if( !(mask) || ((mask) & lockf_debug)) {        \
 113                         printf(__VA_ARGS__);                    \
 114                 }                                               \
 115         } while(0)
 116 #else   /* !LOCKF_DEBUGGING */
 117 #define LOCKF_DEBUG(mask, ...)          /* mask */
 118 #endif  /* !LOCKF_DEBUGGING */
 119
 120 MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
 121
 122 #define NOLOCKF (struct lockf *)0
 123 #define SELF    0x1
 124 #define OTHERS  0x2
 125 #define OFF_MAX 0x7fffffffffffffffULL   /* max off_t */
 126
 127 /*
 128  * Overlapping lock states
 129  */
 130 typedef enum {
 131         OVERLAP_NONE = 0,
 132         OVERLAP_EQUALS_LOCK,
 133         OVERLAP_CONTAINS_LOCK,
 134         OVERLAP_CONTAINED_BY_LOCK,
 135         OVERLAP_STARTS_BEFORE_LOCK,
 136         OVERLAP_ENDS_AFTER_LOCK
 137 } overlap_t;
 138
 139 static int       lf_clearlock(struct lockf *);
 140 static overlap_t lf_findoverlap(struct lockf *,
 141             struct lockf *, int, struct lockf ***, struct lockf **);
 142 static struct lockf *lf_getblock(struct lockf *, pid_t);
 143 static int       lf_getlock(struct lockf *, struct flock *, pid_t);
 144 static int       lf_setlock(struct lockf *, struct timespec *);
 145 static int       lf_split(struct lockf *, struct lockf *);
 146 static void      lf_wakelock(struct lockf *, boolean_t);
 147 #if IMPORTANCE_INHERITANCE
 148 static void      lf_hold_assertion(task_t, struct lockf *);
 149 static void      lf_jump_to_queue_head(struct lockf *, struct lockf *);
 150 static void      lf_drop_assertion(struct lockf *);
 151 static void      lf_boost_blocking_proc(struct lockf *, struct lockf *);
 152 static void      lf_adjust_assertion(struct lockf *block);
 153 #endif /* IMPORTANCE_INHERITANCE */
 154
 155 /*
 156  * lf_advlock
 157  *
 158  * Description: Advisory record locking support
 159  *
 160  * Parameters:  ap                      Argument pointer to a vnop_advlock_args
 161  *                                      argument descriptor structure for the
 162  *                                      lock operation to be attempted.
 163  *
 164  * Returns:     0                       Success
 165  *              EOVERFLOW
 166  *              EINVAL
 167  *              ENOLCK                  Number of locked regions exceeds limit
 168  *      lf_setlock:EAGAIN
 169  *      lf_setlock:EDEADLK
 170  *      lf_setlock:EINTR
 171  *      lf_setlock:ENOLCK
 172  *      lf_setlock:ETIMEDOUT
 173  *      lf_clearlock:ENOLCK
 174  *      vnode_size:???
 175  *
 176  * Notes:       We return ENOLCK when we run out of memory to support locks; as
 177  *              such, there is no specific expectation limit other than the
 178  *              amount of available resources.
 179  */
 180 int
 181 lf_advlock(struct vnop_advlock_args *ap)
 182 {
 183         struct vnode *vp = ap->a_vp;
 184         struct flock *fl = ap->a_fl;
 185         vfs_context_t context = ap->a_context;
 186         struct lockf *lock;
 187         off_t start, end, oadd;
 188         u_quad_t size;
 189         int error;
 190         struct lockf **head = &vp->v_lockf;
 191
 192         /* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */
 193
 194         /*
 195          * Avoid the common case of unlocking when inode has no locks.
 196          */
 197         if (*head == (struct lockf *)0) {
 198                 if (ap->a_op != F_SETLK) {
 199                         fl->l_type = F_UNLCK;
 200                         LOCKF_DEBUG(LF_DBG_TRACE,
 201                             "lf_advlock: '%s' unlock without lock\n",
 202                             vfs_context_proc(context)->p_comm);
 203                         return (0);
 204                 }
 205         }
 206
 207         /*
 208          * Convert the flock structure into a start and end.
 209          */
 210         switch (fl->l_whence) {
 211
 212         case SEEK_SET:
 213         case SEEK_CUR:
 214                 /*
 215                  * Caller is responsible for adding any necessary offset
 216                  * when SEEK_CUR is used.
 217                  */
 218                 start = fl->l_start;
 219                 break;
 220
 221         case SEEK_END:
 222
 223                 /*
 224                  * It's OK to cast the u_quad_t to and off_t here, since they
 225                  * are the same storage size, and the value of the returned
 226                  * contents will never overflow into the sign bit.  We need to
 227                  * do this because we will use size to force range checks.
 228                  */
 229                 if ((error = vnode_size(vp, (off_t *)&size, context))) {
 230                         LOCKF_DEBUG(LF_DBG_TRACE,
 231                             "lf_advlock: vnode_getattr failed: %d\n", error);
 232                         return (error);
 233                 }
 234
 235                 if (size > OFF_MAX ||
 236                     (fl->l_start > 0 &&
 237                      size > (u_quad_t)(OFF_MAX - fl->l_start)))
 238                         return (EOVERFLOW);
 239                 start = size + fl->l_start;
 240                 break;
 241
 242         default:
 243                 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: unknown whence %d\n",
 244                     fl->l_whence);
 245                 return (EINVAL);
 246         }
 247         if (start < 0) {
 248                 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: start < 0 (%qd)\n",
 249                     start);
 250                 return (EINVAL);
 251         }
 252         if (fl->l_len < 0) {
 253                 if (start == 0) {
 254                         LOCKF_DEBUG(LF_DBG_TRACE,
 255                             "lf_advlock: len < 0 & start == 0\n");
 256                         return (EINVAL);
 257                 }
 258                 end = start - 1;
 259                 start += fl->l_len;
 260                 if (start < 0) {
 261                         LOCKF_DEBUG(LF_DBG_TRACE,
 262                             "lf_advlock: start < 0 (%qd)\n", start);
 263                         return (EINVAL);
 264                 }
 265         } else if (fl->l_len == 0)
 266                 end = -1;
 267         else {
 268                 oadd = fl->l_len - 1;
 269                 if (oadd > (off_t)(OFF_MAX - start)) {
 270                         LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: overflow\n");
 271                         return (EOVERFLOW);
 272                 }
 273                 end = start + oadd;
 274         }
 275         /*
 276          * Create the lockf structure
 277          */
 278         MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
 279         if (lock == NULL)
 280                 return (ENOLCK);
 281         lock->lf_start = start;
 282         lock->lf_end = end;
 283         lock->lf_id = ap->a_id;
 284         lock->lf_vnode = vp;
 285         lock->lf_type = fl->l_type;
 286         lock->lf_head = head;
 287         lock->lf_next = (struct lockf *)0;
 288         TAILQ_INIT(&lock->lf_blkhd);
 289         lock->lf_flags = ap->a_flags;
 290 #if IMPORTANCE_INHERITANCE
 291         lock->lf_boosted = LF_NOT_BOOSTED;
 292 #endif
 293         if (ap->a_flags & F_POSIX)
 294                 lock->lf_owner = (struct proc *)lock->lf_id;
 295         else
 296                 lock->lf_owner = NULL;
 297
 298         if (ap->a_flags & F_FLOCK)
 299                 lock->lf_flags |= F_WAKE1_SAFE;
 300
 301         lck_mtx_lock(&vp->v_lock);      /* protect the lockf list */
 302         /*
 303          * Do the requested operation.
 304          */
 305         switch(ap->a_op) {
 306         case F_SETLK:
 307                 /*
 308                  * For F_OFD_* locks, lf_id is the fileglob.
 309                  * Record an "lf_owner" iff this is a confined fd
 310                  * i.e. it cannot escape this process and will be
 311                  * F_UNLCKed before the owner exits.  (This is
 312                  * the implicit guarantee needed to ensure lf_owner
 313                  * remains a valid reference here.)
 314                  */
 315                 if (ap->a_flags & F_OFD_LOCK) {
 316                         struct fileglob *fg = (void *)lock->lf_id;
 317                         if (fg->fg_lflags & FG_CONFINED)
 318                                 lock->lf_owner = current_proc();
 319                 }
 320                 error = lf_setlock(lock, ap->a_timeout);
 321                 break;
 322
 323         case F_UNLCK:
 324                 error = lf_clearlock(lock);
 325                 FREE(lock, M_LOCKF);
 326                 break;
 327
 328         case F_GETLK:
 329                 error = lf_getlock(lock, fl, -1);
 330                 FREE(lock, M_LOCKF);
 331                 break;
 332
 333
 334         default:
 335                 FREE(lock, M_LOCKF);
 336                 error = EINVAL;
 337                 break;
 338         }
 339         lck_mtx_unlock(&vp->v_lock);    /* done manipulating the list */
 340
 341         LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: normal exit: %d\n", error);
 342         return (error);
 343 }
 344
 345 /*
 346  * Empty the queue of msleeping requests for a lock on the given vnode.
 347  * Called with the vnode already locked.  Used for forced unmount, where
 348  * a flock(2) invoker sleeping on a blocked lock holds an iocount reference
 349  * that prevents the vnode from ever being drained.  Force unmounting wins.
 350  */
 351 void
 352 lf_abort_advlocks(vnode_t vp)
 353 {
 354         struct lockf *lock;
 355
 356         if ((lock = vp->v_lockf) == NULL)
 357                 return;
 358
 359         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
 360
 361         if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
 362                 struct lockf *tlock;
 363
 364                 TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
 365                         /*
 366                          * Setting this flag should cause all
 367                          * currently blocked F_SETLK request to
 368                          * return to userland with an errno.
 369                          */
 370                         tlock->lf_flags |= F_ABORT;
 371                 }
 372                 lf_wakelock(lock, TRUE);
 373         }
 374 }
 375
 376 /*
 377  * Take any lock attempts which are currently blocked by a given lock ("from")
 378  * and mark them as blocked by a different lock ("to").  Used in the case
 379  * where a byte range currently occupied by "from" is to be occupied by "to."
 380  */
 381 static void
 382 lf_move_blocked(struct lockf *to, struct lockf *from)
 383 {
 384         struct lockf *tlock;
 385
 386         TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) {
 387                 tlock->lf_next = to;
 388         }
 389
 390         TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block);
 391 }
 392
 393 /*
 394  * lf_coalesce_adjacent
 395  *
 396  * Description: Helper function: when setting a lock, coalesce adjacent
 397  *              locks.  Needed because adjacent locks are not overlapping,
 398  *              but POSIX requires that they be coalesced.
 399  *
 400  * Parameters:  lock                    The new lock which may be adjacent
 401  *                                      to already locked regions, and which
 402  *                                      should therefore be coalesced with them
 403  *
 404  * Returns:     <void>
 405  */
 406 static void
 407 lf_coalesce_adjacent(struct lockf *lock)
 408 {
 409         struct lockf **lf = lock->lf_head;
 410
 411         while (*lf != NOLOCKF) {
 412                 /* reject locks that obviously could not be coalesced */
 413                 if ((*lf == lock) ||
 414                     ((*lf)->lf_id != lock->lf_id) ||
 415                     ((*lf)->lf_type != lock->lf_type)) {
 416                         lf = &(*lf)->lf_next;
 417                         continue;
 418                 }
 419
 420                 /*
 421                  * NOTE: Assumes that if two locks are adjacent on the number line
 422                  * and belong to the same owner, then they are adjacent on the list.
 423                  */
 424                 if ((*lf)->lf_end != -1 &&
 425                     ((*lf)->lf_end + 1) == lock->lf_start) {
 426                         struct lockf *adjacent = *lf;
 427
 428                         LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent previous\n");
 429                         lock->lf_start = (*lf)->lf_start;
 430                         *lf = lock;
 431                         lf = &(*lf)->lf_next;
 432
 433                         lf_move_blocked(lock, adjacent);
 434
 435                         FREE(adjacent, M_LOCKF);
 436                         continue;
 437                 }
 438                 /* If the lock starts adjacent to us, we can coalesce it */
 439                 if (lock->lf_end != -1 &&
 440                     (lock->lf_end + 1) == (*lf)->lf_start) {
 441                         struct lockf *adjacent = *lf;
 442
 443                         LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent following\n");
 444                         lock->lf_end = (*lf)->lf_end;
 445                         lock->lf_next = (*lf)->lf_next;
 446                         lf = &lock->lf_next;
 447
 448                         lf_move_blocked(lock, adjacent);
 449
 450                         FREE(adjacent, M_LOCKF);
 451                         continue;
 452                 }
 453
 454                 /* no matching conditions; go on to next lock */
 455                 lf = &(*lf)->lf_next;
 456         }
 457 }
 458
 459 /*
 460  * lf_setlock
 461  *
 462  * Description: Set a byte-range lock.
 463  *
 464  * Parameters:  lock                    The lock structure describing the lock
 465  *                                      to be set; allocated by the caller, it
 466  *                                      will be linked into the lock list if
 467  *                                      the set is successful, and freed if the
 468  *                                      set is unsuccessful.
 469  *
 470  *              timeout                 Timeout specified in the case of
 471  *                                      SETLKWTIMEOUT.
 472  *
 473  * Returns:     0                       Success
 474  *              EAGAIN
 475  *              EDEADLK
 476  *      lf_split:ENOLCK
 477  *      lf_clearlock:ENOLCK
 478  *      msleep:EINTR
 479  *      msleep:ETIMEDOUT
 480  *
 481  * Notes:       We add the lock to the provisional lock list.  We do not
 482  *              coalesce at this time; this has implications for other lock
 483  *              requestors in the blocker search mechanism.
 484  */
 485 static int
 486 lf_setlock(struct lockf *lock, struct timespec *timeout)
 487 {
 488         struct lockf *block;
 489         struct lockf **head = lock->lf_head;
 490         struct lockf **prev, *overlap, *ltmp;
 491         static char lockstr[] = "lockf";
 492         int priority, needtolink, error;
 493         struct vnode *vp = lock->lf_vnode;
 494         overlap_t ovcase;
 495
 496 #ifdef LOCKF_DEBUGGING
 497         if (lockf_debug & LF_DBG_LOCKOP) {
 498                 lf_print("lf_setlock", lock);
 499                 lf_printlist("lf_setlock(in)", lock);
 500         }
 501 #endif /* LOCKF_DEBUGGING */
 502
 503         /*
 504          * Set the priority
 505          */
 506         priority = PLOCK;
 507         if (lock->lf_type == F_WRLCK)
 508                 priority += 4;
 509         priority |= PCATCH;
 510         /*
 511          * Scan lock list for this file looking for locks that would block us.
 512          */
 513         while ((block = lf_getblock(lock, -1))) {
 514                 /*
 515                  * Free the structure and return if nonblocking.
 516                  */
 517                 if ((lock->lf_flags & F_WAIT) == 0) {
 518                         DTRACE_FSINFO(advlock__nowait, vnode_t, vp);
 519                         FREE(lock, M_LOCKF);
 520                         return (EAGAIN);
 521                 }
 522
 523                 /*
 524                  * We are blocked. Since flock style locks cover
 525                  * the whole file, there is no chance for deadlock.
 526                  *
 527                  * OFD byte-range locks currently do NOT support
 528                  * deadlock detection.
 529                  *
 530                  * For POSIX byte-range locks we must check for deadlock.
 531                  *
 532                  * Deadlock detection is done by looking through the
 533                  * wait channels to see if there are any cycles that
 534                  * involve us. MAXDEPTH is set just to make sure we
 535                  * do not go off into neverland.
 536                  */
 537                 if ((lock->lf_flags & F_POSIX) &&
 538                     (block->lf_flags & F_POSIX)) {
 539                         struct proc *wproc, *bproc;
 540                         struct uthread *ut;
 541                         struct lockf *waitblock;
 542                         int i = 0;
 543
 544                         /* The block is waiting on something */
 545                         wproc = block->lf_owner;
 546                         proc_lock(wproc);
 547                         TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
 548                                 /*
 549                                  * While the thread is asleep (uu_wchan != 0)
 550                                  * in this code (uu_wmesg == lockstr)
 551                                  * and we have not exceeded the maximum cycle
 552                                  * depth (i < maxlockdepth), then check for a
 553                                  * cycle to see if the lock is blocked behind
 554                                  * someone blocked behind us.
 555                                  */
 556                                 while (((waitblock = (struct lockf *)ut->uu_wchan) != NULL) &&
 557                                     ut->uu_wmesg == lockstr &&
 558                                     (i++ < maxlockdepth)) {
 559                                         waitblock = (struct lockf *)ut->uu_wchan;
 560                                         /*
 561                                          * Get the lock blocking the lock
 562                                          * which would block us, and make
 563                                          * certain it hasn't come unblocked
 564                                          * (been granted, e.g. between the time
 565                                          * we called lf_getblock, and the time
 566                                          * we successfully acquired the
 567                                          * proc_lock).
 568                                          */
 569                                         waitblock = waitblock->lf_next;
 570                                         if (waitblock == NULL)
 571                                                 break;
 572
 573                                         /*
 574                                          * Make sure it's an advisory range
 575                                          * lock and not any other kind of lock;
 576                                          * if we mix lock types, it's our own
 577                                          * fault.
 578                                          */
 579                                         if ((waitblock->lf_flags & F_POSIX) == 0)
 580                                                 break;
 581
 582                                         /*
 583                                          * If the owner of the lock that's
 584                                          * blocking a lock that's blocking us
 585                                          * getting the requested lock, then we
 586                                          * would deadlock, so error out.
 587                                          */
 588                                         bproc = waitblock->lf_owner;
 589                                         if (bproc == lock->lf_owner) {
 590                                                 proc_unlock(wproc);
 591                                                 FREE(lock, M_LOCKF);
 592                                                 return (EDEADLK);
 593                                         }
 594                                 }
 595                         }
 596                         proc_unlock(wproc);
 597                 }
 598
 599                 /*
 600                  * For flock type locks, we must first remove
 601                  * any shared locks that we hold before we sleep
 602                  * waiting for an exclusive lock.
 603                  */
 604                 if ((lock->lf_flags & F_FLOCK) &&
 605                     lock->lf_type == F_WRLCK) {
 606                         lock->lf_type = F_UNLCK;
 607                         if ((error = lf_clearlock(lock)) != 0) {
 608                                 FREE(lock, M_LOCKF);
 609                                 return (error);
 610                         }
 611                         lock->lf_type = F_WRLCK;
 612                 }
 613                 /*
 614                  * Add our lock to the blocked list and sleep until we're free.
 615                  * Remember who blocked us (for deadlock detection).
 616                  */
 617                 lock->lf_next = block;
 618                 TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
 619
 620                 if ( !(lock->lf_flags & F_FLOCK))
 621                         block->lf_flags &= ~F_WAKE1_SAFE;
 622
 623 #if IMPORTANCE_INHERITANCE
 624                 /*
 625                  * Importance donation is done only for cases where the
 626                  * owning task can be unambiguously determined.
 627                  *
 628                  * POSIX type locks are not inherited by child processes;
 629                  * we maintain a 1:1 mapping between a lock and its owning
 630                  * process.
 631                  *
 632                  * Flock type locks are inherited across fork() and there is
 633                  * no 1:1 mapping in the general case.  However, the fileglobs
 634                  * used by OFD locks *may* be confined to the process that
 635                  * created them, and thus have an "owner", in which case
 636                  * we also attempt importance donation.
 637                  */
 638                 if ((lock->lf_flags & block->lf_flags & F_POSIX) != 0)
 639                         lf_boost_blocking_proc(lock, block);
 640                 else if ((lock->lf_flags & block->lf_flags & F_OFD_LOCK) &&
 641                     lock->lf_owner != block->lf_owner &&
 642                     NULL != lock->lf_owner && NULL != block->lf_owner)
 643                         lf_boost_blocking_proc(lock, block);
 644 #endif /* IMPORTANCE_INHERITANCE */
 645
 646 #ifdef LOCKF_DEBUGGING
 647                 if (lockf_debug & LF_DBG_LOCKOP) {
 648                         lf_print("lf_setlock: blocking on", block);
 649                         lf_printlist("lf_setlock(block)", block);
 650                 }
 651 #endif /* LOCKF_DEBUGGING */
 652                 DTRACE_FSINFO(advlock__wait, vnode_t, vp);
 653
 654                 error = msleep(lock, &vp->v_lock, priority, lockstr, timeout);
 655
 656                 if (error == 0 && (lock->lf_flags & F_ABORT) != 0)
 657                         error = EBADF;
 658
 659                 if (lock->lf_next) {
 660                         /*
 661                          * lf_wakelock() always sets wakelock->lf_next to
 662                          * NULL before a wakeup; so we've been woken early
 663                          * - perhaps by a debugger, signal or other event.
 664                          *
 665                          * Remove 'lock' from the block list (avoids double-add
 666                          * in the spurious case, which would create a cycle)
 667                          */
 668                         TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
 669 #if IMPORTANCE_INHERITANCE
 670                         /*
 671                          * Adjust the boost on lf_next.
 672                          */
 673                         lf_adjust_assertion(lock->lf_next);
 674 #endif /* IMPORTANCE_INHERITANCE */
 675                         lock->lf_next = NULL;
 676
 677                         if (error == 0) {
 678                                 /*
 679                                  * If this was a spurious wakeup, retry
 680                                  */
 681                                 printf("%s: spurious wakeup, retrying lock\n",
 682                                     __func__);
 683                                 continue;
 684                         }
 685                 }
 686
 687                 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
 688                         if ((block = lf_getblock(lock, -1)) != NULL)
 689                                 lf_move_blocked(block, lock);
 690                 }
 691
 692                 if (error) {
 693                         if (!TAILQ_EMPTY(&lock->lf_blkhd))
 694                                 lf_wakelock(lock, TRUE);
 695                         FREE(lock, M_LOCKF);
 696                         /* Return ETIMEDOUT if timeout occoured. */
 697                         if (error == EWOULDBLOCK) {
 698                                 error = ETIMEDOUT;
 699                         }
 700                         return (error);
 701                 }
 702         }
 703
 704         /*
 705          * No blocks!!  Add the lock.  Note that we will
 706          * downgrade or upgrade any overlapping locks this
 707          * process already owns.
 708          *
 709          * Skip over locks owned by other processes.
 710          * Handle any locks that overlap and are owned by ourselves.
 711          */
 712         prev = head;
 713         block = *head;
 714         needtolink = 1;
 715         for (;;) {
 716                 ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
 717                 if (ovcase)
 718                         block = overlap->lf_next;
 719                 /*
 720                  * Six cases:
 721                  *      0) no overlap
 722                  *      1) overlap == lock
 723                  *      2) overlap contains lock
 724                  *      3) lock contains overlap
 725                  *      4) overlap starts before lock
 726                  *      5) overlap ends after lock
 727                  */
 728                 switch (ovcase) {
 729                 case OVERLAP_NONE:
 730                         if (needtolink) {
 731                                 *prev = lock;
 732                                 lock->lf_next = overlap;
 733                         }
 734                         break;
 735
 736                 case OVERLAP_EQUALS_LOCK:
 737                         /*
 738                          * If downgrading lock, others may be
 739                          * able to acquire it.
 740                          */
 741                         if (lock->lf_type == F_RDLCK &&
 742                             overlap->lf_type == F_WRLCK)
 743                                 lf_wakelock(overlap, TRUE);
 744                         overlap->lf_type = lock->lf_type;
 745                         FREE(lock, M_LOCKF);
 746                         lock = overlap; /* for lf_coalesce_adjacent() */
 747                         break;
 748
 749                 case OVERLAP_CONTAINS_LOCK:
 750                         /*
 751                          * Check for common starting point and different types.
 752                          */
 753                         if (overlap->lf_type == lock->lf_type) {
 754                                 FREE(lock, M_LOCKF);
 755                                 lock = overlap; /* for lf_coalesce_adjacent() */
 756                                 break;
 757                         }
 758                         if (overlap->lf_start == lock->lf_start) {
 759                                 *prev = lock;
 760                                 lock->lf_next = overlap;
 761                                 overlap->lf_start = lock->lf_end + 1;
 762                         } else {
 763                                 /*
 764                                  * If we can't split the lock, we can't
 765                                  * grant it.  Claim a system limit for the
 766                                  * resource shortage.
 767                                  */
 768                                 if (lf_split(overlap, lock)) {
 769                                         FREE(lock, M_LOCKF);
 770                                         return (ENOLCK);
 771                                 }
 772                         }
 773                         lf_wakelock(overlap, TRUE);
 774                         break;
 775
 776                 case OVERLAP_CONTAINED_BY_LOCK:
 777                         /*
 778                          * If downgrading lock, others may be able to
 779                          * acquire it, otherwise take the list.
 780                          */
 781                         if (lock->lf_type == F_RDLCK &&
 782                             overlap->lf_type == F_WRLCK) {
 783                                 lf_wakelock(overlap, TRUE);
 784                         } else {
 785                                 while (!TAILQ_EMPTY(&overlap->lf_blkhd)) {
 786                                         ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
 787                                         TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
 788                                             lf_block);
 789                                         TAILQ_INSERT_TAIL(&lock->lf_blkhd,
 790                                             ltmp, lf_block);
 791                                         ltmp->lf_next = lock;
 792                                 }
 793                         }
 794                         /*
 795                          * Add the new lock if necessary and delete the overlap.
 796                          */
 797                         if (needtolink) {
 798                                 *prev = lock;
 799                                 lock->lf_next = overlap->lf_next;
 800                                 prev = &lock->lf_next;
 801                                 needtolink = 0;
 802                         } else
 803                                 *prev = overlap->lf_next;
 804                         FREE(overlap, M_LOCKF);
 805                         continue;
 806
 807                 case OVERLAP_STARTS_BEFORE_LOCK:
 808                         /*
 809                          * Add lock after overlap on the list.
 810                          */
 811                         lock->lf_next = overlap->lf_next;
 812                         overlap->lf_next = lock;
 813                         overlap->lf_end = lock->lf_start - 1;
 814                         prev = &lock->lf_next;
 815                         lf_wakelock(overlap, TRUE);
 816                         needtolink = 0;
 817                         continue;
 818
 819                 case OVERLAP_ENDS_AFTER_LOCK:
 820                         /*
 821                          * Add the new lock before overlap.
 822                          */
 823                         if (needtolink) {
 824                                 *prev = lock;
 825                                 lock->lf_next = overlap;
 826                         }
 827                         overlap->lf_start = lock->lf_end + 1;
 828                         lf_wakelock(overlap, TRUE);
 829                         break;
 830                 }
 831                 break;
 832         }
 833         /* Coalesce adjacent locks with identical attributes */
 834         lf_coalesce_adjacent(lock);
 835 #ifdef LOCKF_DEBUGGING
 836         if (lockf_debug & LF_DBG_LOCKOP) {
 837                 lf_print("lf_setlock: got the lock", lock);
 838                 lf_printlist("lf_setlock(out)", lock);
 839         }
 840 #endif /* LOCKF_DEBUGGING */
 841         return (0);
 842 }
 843
 844
 845 /*
 846  * lf_clearlock
 847  *
 848  * Description: Remove a byte-range lock on an vnode.  Generally, find the
 849  *              lock (or an overlap to that lock) and remove it (or shrink
 850  *              it), then wakeup anyone we can.
 851  *
 852  * Parameters:  unlock                  The lock to clear
 853  *
 854  * Returns:     0                       Success
 855  *      lf_split:ENOLCK
 856  *
 857  * Notes:       A caller may unlock all the locks owned by the caller by
 858  *              specifying the entire file range; locks owned by other
 859  *              callers are not effected by this operation.
 860  */
 861 static int
 862 lf_clearlock(struct lockf *unlock)
 863 {
 864         struct lockf **head = unlock->lf_head;
 865         struct lockf *lf = *head;
 866         struct lockf *overlap, **prev;
 867         overlap_t ovcase;
 868
 869         if (lf == NOLOCKF)
 870                 return (0);
 871 #ifdef LOCKF_DEBUGGING
 872         if (unlock->lf_type != F_UNLCK)
 873                 panic("lf_clearlock: bad type");
 874         if (lockf_debug & LF_DBG_LOCKOP)
 875                 lf_print("lf_clearlock", unlock);
 876 #endif /* LOCKF_DEBUGGING */
 877         prev = head;
 878         while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
 879                 /*
 880                  * Wakeup the list of locks to be retried.
 881                  */
 882                 lf_wakelock(overlap, FALSE);
 883 #if IMPORTANCE_INHERITANCE
 884                 if (overlap->lf_boosted == LF_BOOSTED) {
 885                         lf_drop_assertion(overlap);
 886                 }
 887 #endif /* IMPORTANCE_INHERITANCE */
 888
 889                 switch (ovcase) {
 890                 case OVERLAP_NONE:      /* satisfy compiler enum/switch */
 891                         break;
 892
 893                 case OVERLAP_EQUALS_LOCK:
 894                         *prev = overlap->lf_next;
 895                         FREE(overlap, M_LOCKF);
 896                         break;
 897
 898                 case OVERLAP_CONTAINS_LOCK: /* split it */
 899                         if (overlap->lf_start == unlock->lf_start) {
 900                                 overlap->lf_start = unlock->lf_end + 1;
 901                                 break;
 902                         }
 903                         /*
 904                          * If we can't split the lock, we can't grant it.
 905                          * Claim a system limit for the resource shortage.
 906                          */
 907                         if (lf_split(overlap, unlock))
 908                                 return (ENOLCK);
 909                         overlap->lf_next = unlock->lf_next;
 910                         break;
 911
 912                 case OVERLAP_CONTAINED_BY_LOCK:
 913                         *prev = overlap->lf_next;
 914                         lf = overlap->lf_next;
 915                         FREE(overlap, M_LOCKF);
 916                         continue;
 917
 918                 case OVERLAP_STARTS_BEFORE_LOCK:
 919                         overlap->lf_end = unlock->lf_start - 1;
 920                         prev = &overlap->lf_next;
 921                         lf = overlap->lf_next;
 922                         continue;
 923
 924                 case OVERLAP_ENDS_AFTER_LOCK:
 925                         overlap->lf_start = unlock->lf_end + 1;
 926                         break;
 927                 }
 928                 break;
 929         }
 930 #ifdef LOCKF_DEBUGGING
 931         if (lockf_debug & LF_DBG_LOCKOP)
 932                 lf_printlist("lf_clearlock", unlock);
 933 #endif /* LOCKF_DEBUGGING */
 934         return (0);
 935 }
 936
 937
 938 /*
 939  * lf_getlock
 940  *
 941  * Description: Check whether there is a blocking lock, and if so return
 942  *              its process identifier into the lock being requested.
 943  *
 944  * Parameters:  lock                    Pointer to lock to test for blocks
 945  *              fl                      Pointer to flock structure to receive
 946  *                                      the blocking lock information, if a
 947  *                                      blocking lock is found.
 948  *              matchpid                -1, or pid value to match in lookup.
 949  *
 950  * Returns:     0                       Success
 951  *
 952  * Implicit Returns:
 953  *              *fl                     Contents modified to reflect the
 954  *                                      blocking lock, if one is found; not
 955  *                                      modified otherwise
 956  *
 957  * Notes:       fl->l_pid will be (-1) for file locks and will only be set to
 958  *              the blocking process ID for advisory record locks.
 959  */
 960 static int
 961 lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid)
 962 {
 963         struct lockf *block;
 964
 965 #ifdef LOCKF_DEBUGGING
 966         if (lockf_debug & LF_DBG_LOCKOP)
 967                 lf_print("lf_getlock", lock);
 968 #endif /* LOCKF_DEBUGGING */
 969
 970         if ((block = lf_getblock(lock, matchpid))) {
 971                 fl->l_type = block->lf_type;
 972                 fl->l_whence = SEEK_SET;
 973                 fl->l_start = block->lf_start;
 974                 if (block->lf_end == -1)
 975                         fl->l_len = 0;
 976                 else
 977                         fl->l_len = block->lf_end - block->lf_start + 1;
 978                 if (NULL != block->lf_owner) {
 979                         /*
 980                          * lf_owner is only non-NULL when the lock
 981                          * "owner" can be unambiguously determined
 982                          */
 983                         fl->l_pid = proc_pid(block->lf_owner);
 984                 } else
 985                         fl->l_pid = -1;
 986         } else {
 987                 fl->l_type = F_UNLCK;
 988         }
 989         return (0);
 990 }
 991
 992 /*
 993  * lf_getblock
 994  *
 995  * Description: Walk the list of locks for an inode and return the first
 996  *              blocking lock.  A lock is considered blocking if we are not
 997  *              the lock owner; otherwise, we are permitted to upgrade or
 998  *              downgrade it, and it's not considered blocking.
 999  *
1000  * Parameters:  lock                    The lock for which we are interested
1001  *                                      in obtaining the blocking lock, if any
1002  *              matchpid                -1, or pid value to match in lookup.
1003  *
1004  * Returns:     NOLOCKF                 No blocking lock exists
1005  *              !NOLOCKF                The address of the blocking lock's
1006  *                                      struct lockf.
1007  */
1008 static struct lockf *
1009 lf_getblock(struct lockf *lock, pid_t matchpid)
1010 {
1011         struct lockf **prev, *overlap, *lf = *(lock->lf_head);
1012
1013         for (prev = lock->lf_head;
1014             lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != OVERLAP_NONE;
1015             lf = overlap->lf_next) {
1016                 /*
1017                  * Found an overlap.
1018                  *
1019                  * If we're matching pids, and it's a record lock,
1020                  * or it's an OFD lock on a process-confined fd,
1021                  * but the pid doesn't match, then keep on looking ..
1022                  */
1023                 if (matchpid != -1 &&
1024                     (overlap->lf_flags & (F_POSIX|F_OFD_LOCK)) != 0 &&
1025                     proc_pid(overlap->lf_owner) != matchpid)
1026                         continue;
1027
1028                 /*
1029                  * does it block us?
1030                  */
1031                 if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
1032                         return (overlap);
1033         }
1034         return (NOLOCKF);
1035 }
1036
1037
1038 /*
1039  * lf_findoverlap
1040  *
1041  * Description: Walk the list of locks to find an overlapping lock (if any).
1042  *
1043  * Parameters:  lf                      First lock on lock list
1044  *              lock                    The lock we are checking for an overlap
1045  *              check                   Check type
1046  *              prev                    pointer to pointer pointer to contain
1047  *                                      address of pointer to previous lock
1048  *                                      pointer to overlapping lock, if overlap
1049  *              overlap                 pointer to pointer to contain address
1050  *                                      of overlapping lock
1051  *
1052  * Returns:     OVERLAP_NONE
1053  *              OVERLAP_EQUALS_LOCK
1054  *              OVERLAP_CONTAINS_LOCK
1055  *              OVERLAP_CONTAINED_BY_LOCK
1056  *              OVERLAP_STARTS_BEFORE_LOCK
1057  *              OVERLAP_ENDS_AFTER_LOCK
1058  *
1059  * Implicit Returns:
1060  *              *prev                   The address of the next pointer in the
1061  *                                      lock previous to the overlapping lock;
1062  *                                      this is generally used to relink the
1063  *                                      lock list, avoiding a second iteration.
1064  *              *overlap                The pointer to the overlapping lock
1065  *                                      itself; this is used to return data in
1066  *                                      the check == OTHERS case, and for the
1067  *                                      caller to modify the overlapping lock,
1068  *                                      in the check == SELF case
1069  *
1070  * Note:        This returns only the FIRST overlapping lock.  There may be
1071  *              more than one.  lf_getlock will return the first blocking lock,
1072  *              while lf_setlock will iterate over all overlapping locks to
1073  *
1074  *              The check parameter can be SELF, meaning we are looking for
1075  *              overlapping locks owned by us, or it can be OTHERS, meaning
1076  *              we are looking for overlapping locks owned by someone else so
1077  *              we can report a blocking lock on an F_GETLK request.
1078  *
1079  *              The value of *overlap and *prev are modified, even if there is
1080  *              no overlapping lock found; always check the return code.
1081  */
1082 static overlap_t
1083 lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
1084                struct lockf ***prev, struct lockf **overlap)
1085 {
1086         off_t start, end;
1087         int found_self = 0;
1088
1089         *overlap = lf;
1090         if (lf == NOLOCKF)
1091                 return (0);
1092 #ifdef LOCKF_DEBUGGING
1093         if (lockf_debug & LF_DBG_LIST)
1094                 lf_print("lf_findoverlap: looking for overlap in", lock);
1095 #endif /* LOCKF_DEBUGGING */
1096         start = lock->lf_start;
1097         end = lock->lf_end;
1098         while (lf != NOLOCKF) {
1099                 if (((type & SELF) && lf->lf_id != lock->lf_id) ||
1100                     ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
1101                         /*
1102                          * Locks belonging to one process are adjacent on the
1103                          * list, so if we've found any locks belonging to us,
1104                          * and we're now seeing something else, then we've
1105                          * examined all "self" locks.  Note that bailing out
1106                          * here is quite important; for coalescing, we assume
1107                          * numerically adjacent locks from the same owner to
1108                          * be adjacent on the list.
1109                          */
1110                         if ((type & SELF) && found_self) {
1111                                 return OVERLAP_NONE;
1112                         }
1113
1114                         *prev = &lf->lf_next;
1115                         *overlap = lf = lf->lf_next;
1116                         continue;
1117                 }
1118
1119                 if ((type & SELF)) {
1120                         found_self = 1;
1121                 }
1122
1123 #ifdef LOCKF_DEBUGGING
1124                 if (lockf_debug & LF_DBG_LIST)
1125                         lf_print("\tchecking", lf);
1126 #endif /* LOCKF_DEBUGGING */
1127                 /*
1128                  * OK, check for overlap
1129                  */
1130                 if ((lf->lf_end != -1 && start > lf->lf_end) ||
1131                     (end != -1 && lf->lf_start > end)) {
1132                         /* Case 0 */
1133                         LOCKF_DEBUG(LF_DBG_LIST, "no overlap\n");
1134
1135                         /*
1136                          * NOTE: assumes that locks for the same process are
1137                          * nonintersecting and ordered.
1138                          */
1139                         if ((type & SELF) && end != -1 && lf->lf_start > end)
1140                                 return (OVERLAP_NONE);
1141                         *prev = &lf->lf_next;
1142                         *overlap = lf = lf->lf_next;
1143                         continue;
1144                 }
1145                 if ((lf->lf_start == start) && (lf->lf_end == end)) {
1146                         LOCKF_DEBUG(LF_DBG_LIST, "overlap == lock\n");
1147                         return (OVERLAP_EQUALS_LOCK);
1148                 }
1149                 if ((lf->lf_start <= start) &&
1150                     (end != -1) &&
1151                     ((lf->lf_end >= end) || (lf->lf_end == -1))) {
1152                         LOCKF_DEBUG(LF_DBG_LIST, "overlap contains lock\n");
1153                         return (OVERLAP_CONTAINS_LOCK);
1154                 }
1155                 if (start <= lf->lf_start &&
1156                            (end == -1 ||
1157                            (lf->lf_end != -1 && end >= lf->lf_end))) {
1158                         LOCKF_DEBUG(LF_DBG_LIST, "lock contains overlap\n");
1159                         return (OVERLAP_CONTAINED_BY_LOCK);
1160                 }
1161                 if ((lf->lf_start < start) &&
1162                         ((lf->lf_end >= start) || (lf->lf_end == -1))) {
1163                         LOCKF_DEBUG(LF_DBG_LIST, "overlap starts before lock\n");
1164                         return (OVERLAP_STARTS_BEFORE_LOCK);
1165                 }
1166                 if ((lf->lf_start > start) &&
1167                         (end != -1) &&
1168                         ((lf->lf_end > end) || (lf->lf_end == -1))) {
1169                         LOCKF_DEBUG(LF_DBG_LIST, "overlap ends after lock\n");
1170                         return (OVERLAP_ENDS_AFTER_LOCK);
1171                 }
1172                 panic("lf_findoverlap: default");
1173         }
1174         return (OVERLAP_NONE);
1175 }
1176
1177
1178 /*
1179  * lf_split
1180  *
1181  * Description: Split a lock and a contained region into two or three locks
1182  *              as necessary.
1183  *
1184  * Parameters:  lock1                   Lock to split
1185  *              lock2                   Overlapping lock region requiring the
1186  *                                      split (upgrade/downgrade/unlock)
1187  *
1188  * Returns:     0                       Success
1189  *              ENOLCK                  No memory for new lock
1190  *
1191  * Implicit Returns:
1192  *              *lock1                  Modified original lock
1193  *              *lock2                  Overlapping lock (inserted into list)
1194  *              (new lock)              Potential new lock inserted into list
1195  *                                      if split results in 3 locks
1196  *
1197  * Notes:       This operation can only fail if the split would result in three
1198  *              locks, and there is insufficient memory to allocate the third
1199  *              lock; in that case, neither of the locks will be modified.
1200  */
1201 static int
1202 lf_split(struct lockf *lock1, struct lockf *lock2)
1203 {
1204         struct lockf *splitlock;
1205
1206 #ifdef LOCKF_DEBUGGING
1207         if (lockf_debug & LF_DBG_LIST) {
1208                 lf_print("lf_split", lock1);
1209                 lf_print("splitting from", lock2);
1210         }
1211 #endif /* LOCKF_DEBUGGING */
1212         /*
1213          * Check to see if splitting into only two pieces.
1214          */
1215         if (lock1->lf_start == lock2->lf_start) {
1216                 lock1->lf_start = lock2->lf_end + 1;
1217                 lock2->lf_next = lock1;
1218                 return (0);
1219         }
1220         if (lock1->lf_end == lock2->lf_end) {
1221                 lock1->lf_end = lock2->lf_start - 1;
1222                 lock2->lf_next = lock1->lf_next;
1223                 lock1->lf_next = lock2;
1224                 return (0);
1225         }
1226         /*
1227          * Make a new lock consisting of the last part of
1228          * the encompassing lock
1229          */
1230         MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
1231         if (splitlock == NULL)
1232                 return (ENOLCK);
1233         bcopy(lock1, splitlock, sizeof *splitlock);
1234         splitlock->lf_start = lock2->lf_end + 1;
1235         TAILQ_INIT(&splitlock->lf_blkhd);
1236         lock1->lf_end = lock2->lf_start - 1;
1237         /*
1238          * OK, now link it in
1239          */
1240         splitlock->lf_next = lock1->lf_next;
1241         lock2->lf_next = splitlock;
1242         lock1->lf_next = lock2;
1243
1244         return (0);
1245 }
1246
1247
1248 /*
1249  * lf_wakelock
1250  *
1251  * Wakeup a blocklist in the case of a downgrade or unlock, since others
1252  * waiting on the lock may now be able to acquire it.
1253  *
1254  * Parameters:  listhead                Lock list head on which waiters may
1255  *                                      have pending locks
1256  *
1257  * Returns:     <void>
1258  *
1259  * Notes:       This function iterates a list of locks and wakes all waiters,
1260  *              rather than only waiters for the contended regions.  Because
1261  *              of this, for heavily contended files, this can result in a
1262  *              "thundering herd" situation.  Refactoring the code could make
1263  *              this operation more efficient, if heavy contention ever results
1264  *              in a real-world performance problem.
1265  */
1266 static void
1267 lf_wakelock(struct lockf *listhead, boolean_t force_all)
1268 {
1269         struct lockf *wakelock;
1270         boolean_t wake_all = TRUE;
1271
1272         if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE))
1273                 wake_all = FALSE;
1274
1275         while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1276                 wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
1277                 TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
1278
1279                 wakelock->lf_next = NOLOCKF;
1280 #ifdef LOCKF_DEBUGGING
1281                 if (lockf_debug & LF_DBG_LOCKOP)
1282                         lf_print("lf_wakelock: awakening", wakelock);
1283 #endif /* LOCKF_DEBUGGING */
1284                 if (wake_all == FALSE) {
1285                         /*
1286                          * If there are items on the list head block list,
1287                          * move them to the wakelock list instead, and then
1288                          * correct their lf_next pointers.
1289                          */
1290                         if (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1291                                 TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);
1292
1293                                 struct lockf *tlock;
1294
1295                                 TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
1296                                         if (TAILQ_NEXT(tlock, lf_block) == tlock) {
1297                                                 /* See rdar://10887303 */
1298                                                 panic("cycle in wakelock list");
1299                                         }
1300                                         tlock->lf_next = wakelock;
1301                                 }
1302                         }
1303                 }
1304                 wakeup(wakelock);
1305
1306                 if (wake_all == FALSE)
1307                         break;
1308         }
1309 }
1310
1311
1312 #ifdef LOCKF_DEBUGGING
1313 #define GET_LF_OWNER_PID(lf)    (proc_pid((lf)->lf_owner))
1314
1315 /*
1316  * lf_print DEBUG
1317  *
1318  * Print out a lock; lock information is prefixed by the string in 'tag'
1319  *
1320  * Parameters:  tag                     A string tag for debugging
1321  *              lock                    The lock whose information should be
1322  *                                      displayed
1323  *
1324  * Returns:     <void>
1325  */
1326 void
1327 lf_print(const char *tag, struct lockf *lock)
1328 {
1329         printf("%s: lock %p for ", tag, (void *)lock);
1330         if (lock->lf_flags & F_POSIX)
1331                 printf("proc %p (owner %d)",
1332                     lock->lf_id, GET_LF_OWNER_PID(lock));
1333         else if (lock->lf_flags & F_OFD_LOCK)
1334                 printf("fg %p (owner %d)",
1335                     lock->lf_id, GET_LF_OWNER_PID(lock));
1336         else
1337                 printf("id %p", (void *)lock->lf_id);
1338         if (lock->lf_vnode != 0)
1339                 printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
1340                     lock->lf_vnode,
1341                     lock->lf_type == F_RDLCK ? "shared" :
1342                     lock->lf_type == F_WRLCK ? "exclusive" :
1343                     lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1344                     (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1345         else
1346                 printf(" %s, start 0x%016llx, end 0x%016llx",
1347                     lock->lf_type == F_RDLCK ? "shared" :
1348                     lock->lf_type == F_WRLCK ? "exclusive" :
1349                     lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1350                     (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1351         if (!TAILQ_EMPTY(&lock->lf_blkhd))
1352                 printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
1353         else
1354                 printf("\n");
1355 }
1356
1357
1358 /*
1359  * lf_printlist DEBUG
1360  *
1361  * Print out a lock list for the vnode associated with 'lock'; lock information
1362  * is prefixed by the string in 'tag'
1363  *
1364  * Parameters:  tag                     A string tag for debugging
1365  *              lock                    The lock whose vnode's lock list should
1366  *                                      be displayed
1367  *
1368  * Returns:     <void>
1369  */
1370 void
1371 lf_printlist(const char *tag, struct lockf *lock)
1372 {
1373         struct lockf *lf, *blk;
1374
1375         if (lock->lf_vnode == 0)
1376                 return;
1377
1378         printf("%s: Lock list for vno %p:\n",
1379             tag, lock->lf_vnode);
1380         for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) {
1381                 printf("\tlock %p for ",(void *)lf);
1382                 if (lf->lf_flags & F_POSIX)
1383                         printf("proc %p (owner %d)",
1384                             lf->lf_id, GET_LF_OWNER_PID(lf));
1385                 else if (lf->lf_flags & F_OFD_LOCK)
1386                         printf("fg %p (owner %d)",
1387                             lf->lf_id, GET_LF_OWNER_PID(lf));
1388                 else
1389                         printf("id %p", (void *)lf->lf_id);
1390                 printf(", %s, start 0x%016llx, end 0x%016llx",
1391                     lf->lf_type == F_RDLCK ? "shared" :
1392                     lf->lf_type == F_WRLCK ? "exclusive" :
1393                     lf->lf_type == F_UNLCK ? "unlock" :
1394                     "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
1395                 TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
1396                         printf("\n\t\tlock request %p for ", (void *)blk);
1397                         if (blk->lf_flags & F_POSIX)
1398                                 printf("proc %p (owner %d)",
1399                                     blk->lf_id, GET_LF_OWNER_PID(blk));
1400                         else if (blk->lf_flags & F_OFD_LOCK)
1401                                 printf("fg %p (owner %d)",
1402                                     blk->lf_id, GET_LF_OWNER_PID(blk));
1403                         else
1404                                 printf("id %p", (void *)blk->lf_id);
1405                         printf(", %s, start 0x%016llx, end 0x%016llx",
1406                             blk->lf_type == F_RDLCK ? "shared" :
1407                             blk->lf_type == F_WRLCK ? "exclusive" :
1408                             blk->lf_type == F_UNLCK ? "unlock" :
1409                             "unknown", (intmax_t)blk->lf_start,
1410                             (intmax_t)blk->lf_end);
1411                         if (!TAILQ_EMPTY(&blk->lf_blkhd))
1412                                 panic("lf_printlist: bad list");
1413                 }
1414                 printf("\n");
1415         }
1416 }
1417 #endif /* LOCKF_DEBUGGING */
1418
1419 #if IMPORTANCE_INHERITANCE
1420
1421 /*
1422  * lf_hold_assertion
1423  *
1424  * Call task importance hold assertion on the owner of the lock.
1425  *
1426  * Parameters: block_task               Owner of the lock blocking
1427  *                                      current thread.
1428  *
1429  *             block                    lock on which the current thread
1430  *                                      is blocking on.
1431  *
1432  * Returns:    <void>
1433  *
1434  * Notes: The task reference on block_task is not needed to be hold since
1435  *        the current thread has vnode lock and block_task has a file
1436  *        lock, thus removing file lock in exit requires block_task to
1437  *        grab the vnode lock.
1438  */
1439 static void
1440 lf_hold_assertion(task_t block_task, struct lockf *block)
1441 {
1442         if (task_importance_hold_file_lock_assertion(block_task, 1) == 0) {
1443                 block->lf_boosted = LF_BOOSTED;
1444                 LOCKF_DEBUG(LF_DBG_IMPINH,
1445                     "lf: importance hold file lock assert on pid %d lock %p\n",
1446                     proc_pid(block->lf_owner), block);
1447         }
1448 }
1449
1450
1451 /*
1452  * lf_jump_to_queue_head
1453  *
1454  * Jump the lock from the tail of the block queue to the head of
1455  * the queue.
1456  *
1457  * Parameters: block                    lockf struct containing the
1458  *                                      block queue.
1459  *             lock                     lockf struct to be jumped to the
1460  *                                      front.
1461  *
1462  * Returns:    <void>
1463  */
1464 static void
1465 lf_jump_to_queue_head(struct lockf *block, struct lockf *lock)
1466 {
1467         /* Move the lock to the head of the block queue. */
1468         TAILQ_REMOVE(&block->lf_blkhd, lock, lf_block);
1469         TAILQ_INSERT_HEAD(&block->lf_blkhd, lock, lf_block);
1470 }
1471
1472
1473 /*
1474  * lf_drop_assertion
1475  *
1476  * Drops the task hold assertion.
1477  *
1478  * Parameters: block                    lockf struct holding the assertion.
1479  *
1480  * Returns:    <void>
1481  */
1482 static void
1483 lf_drop_assertion(struct lockf *block)
1484 {
1485         LOCKF_DEBUG(LF_DBG_IMPINH, "lf: %d: dropping assertion for lock %p\n",
1486             proc_pid(block->lf_owner), block);
1487
1488         task_t current_task = proc_task(block->lf_owner);
1489         task_importance_drop_file_lock_assertion(current_task, 1);
1490         block->lf_boosted = LF_NOT_BOOSTED;
1491 }
1492
1493 /*
1494  * lf_adjust_assertion
1495  *
1496  * Adjusts importance assertion of file lock. Goes through
1497  * all the blocking locks and checks if the file lock needs
1498  * to be boosted anymore.
1499  *
1500  * Parameters: block    lockf structure which needs to be adjusted.
1501  *
1502  * Returns:     <void>
1503  */
1504 static void
1505 lf_adjust_assertion(struct lockf *block)
1506 {
1507         boolean_t drop_boost = TRUE;
1508         struct lockf *next;
1509
1510         /* Return if the lock is not boosted */
1511         if (block->lf_boosted == LF_NOT_BOOSTED) {
1512                 return;
1513         }
1514
1515         TAILQ_FOREACH(next, &block->lf_blkhd, lf_block) {
1516                 /* Check if block and next are same type of locks */
1517                 if (((block->lf_flags & next->lf_flags & F_POSIX) != 0) ||
1518                     ((block->lf_flags & next->lf_flags & F_OFD_LOCK) &&
1519                      (block->lf_owner != next->lf_owner) &&
1520                      (NULL != block->lf_owner && NULL != next->lf_owner))) {
1521
1522                         /* Check if next would be boosting block */
1523                         if (task_is_importance_donor(proc_task(next->lf_owner)) &&
1524                             task_is_importance_receiver_type(proc_task(block->lf_owner))) {
1525                                 /* Found a lock boosting block */
1526                                 drop_boost = FALSE;
1527                                 break;
1528                         }
1529                 }
1530         }
1531
1532         if (drop_boost) {
1533                 lf_drop_assertion(block);
1534         }
1535 }
1536
1537 static void
1538 lf_boost_blocking_proc(struct lockf *lock, struct lockf *block)
1539 {
1540         task_t ltask = proc_task(lock->lf_owner);
1541         task_t btask = proc_task(block->lf_owner);
1542
1543         /*
1544          * Check if ltask can donate importance. The
1545          * check of imp_donor bit is done without holding
1546          * any lock. The value may change after you read it,
1547          * but it is ok to boost a task while someone else is
1548          * unboosting you.
1549          *
1550          * TODO: Support live inheritance on file locks.
1551          */
1552         if (task_is_importance_donor(ltask)) {
1553                 LOCKF_DEBUG(LF_DBG_IMPINH,
1554                     "lf: %d: attempt to boost pid %d that holds lock %p\n",
1555                     proc_pid(lock->lf_owner), proc_pid(block->lf_owner), block);
1556
1557                 if (block->lf_boosted != LF_BOOSTED &&
1558                     task_is_importance_receiver_type(btask)) {
1559                         lf_hold_assertion(btask, block);
1560                 }
1561                 lf_jump_to_queue_head(block, lock);
1562         }
1563 }
1564 #endif /* IMPORTANCE_INHERITANCE */