apple/xnu (xnu-1504.9.17): bsd/kern/kern_lockf.c
1 /*
2 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * This code is derived from software contributed to Berkeley by
33 * Scooter Morris at Genentech Inc.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 4. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
60 */
61
62 #include <sys/cdefs.h>
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/lock.h>
67 #include <sys/mount.h>
68 #include <sys/proc.h>
69 #include <sys/signalvar.h>
70 #include <sys/unistd.h>
71 #include <sys/user.h>
72 #include <sys/vnode.h>
73 #include <sys/vnode_internal.h>
74 #include <sys/vnode_if.h>
75 #include <sys/malloc.h>
76 #include <sys/fcntl.h>
77 #include <sys/lockf.h>
78
79 /*
80 * This variable controls the maximum number of processes that will
81 * be checked in doing deadlock detection.
82 */
83 static int maxlockdepth = MAXDEPTH;
84
85 #ifdef LOCKF_DEBUGGING
86 #include <sys/sysctl.h>
87 #include <ufs/ufs/quota.h>
88 #include <ufs/ufs/inode.h>
89 void lf_print(const char *tag, struct lockf *lock);
90 void lf_printlist(const char *tag, struct lockf *lock);
91 static int lockf_debug = 2;
92 SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, "");
93
94 /*
95 * If there is no mask bit selector, or there is one and the selector is
96 * set, then output the debugging diagnostic.
97 */
98 #define LOCKF_DEBUG(mask, ...) \
99 do { \
100 if( !(mask) || ((mask) & lockf_debug)) { \
101 printf(__VA_ARGS__); \
102 } \
103 } while(0)
104 #else /* !LOCKF_DEBUGGING */
105 #define LOCKF_DEBUG(mask, ...) /* mask */
106 #endif /* !LOCKF_DEBUGGING */
107
108 MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
109
110 #define NOLOCKF (struct lockf *)0
111 #define SELF 0x1
112 #define OTHERS 0x2
113 #define OFF_MAX 0x7fffffffffffffffULL /* max off_t */
114
115 /*
116 * Overlapping lock states
117 */
118 typedef enum {
119 OVERLAP_NONE = 0,
120 OVERLAP_EQUALS_LOCK,
121 OVERLAP_CONTAINS_LOCK,
122 OVERLAP_CONTAINED_BY_LOCK,
123 OVERLAP_STARTS_BEFORE_LOCK,
124 OVERLAP_ENDS_AFTER_LOCK
125 } overlap_t;
126
127 static int lf_clearlock(struct lockf *);
128 static overlap_t lf_findoverlap(struct lockf *,
129 struct lockf *, int, struct lockf ***, struct lockf **);
130 static struct lockf *lf_getblock(struct lockf *);
131 static int lf_getlock(struct lockf *, struct flock *);
132 static int lf_setlock(struct lockf *);
133 static int lf_split(struct lockf *, struct lockf *);
134 static void lf_wakelock(struct lockf *, boolean_t);
135
136
137 /*
138 * lf_advlock
139 *
140 * Description: Advisory record locking support
141 *
142 * Parameters: ap Argument pointer to a vnop_advlock_args
143 * argument descriptor structure for the
144 * lock operation to be attempted.
145 *
146 * Returns: 0 Success
147 * EOVERFLOW
148 * EINVAL
149 * ENOLCK Number of locked regions exceeds limit
150 * lf_setlock:EAGAIN
151 * lf_setlock:EDEADLK
152 * lf_setlock:EINTR
153 * lf_setlock:ENOLCK
154 * lf_clearlock:ENOLCK
155 * vnode_size:???
156 *
157 * Notes: We return ENOLCK when we run out of memory to support locks; as
158 * such, there is no specific limit on the number of locks other
159 * than the amount of available resources.
160 */
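/*
 * Illustrative userspace view (not part of this file): advisory byte-range
 * lock requests typically originate from fcntl(2) and reach this function
 * through VNOP_ADVLOCK().  A minimal sketch, assuming an open descriptor fd:
 *
 *	struct flock fl;
 *	fl.l_whence = SEEK_SET;		// l_start is relative to start of file
 *	fl.l_start  = 0;		// first byte of the range
 *	fl.l_len    = 100;		// lock bytes [0, 99]; 0 would mean to EOF
 *	fl.l_type   = F_WRLCK;		// exclusive lock
 *	if (fcntl(fd, F_SETLK, &fl) == -1)
 *		perror("fcntl");	// EAGAIN on conflict; EDEADLK only when waiting
 */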
161 int
162 lf_advlock(struct vnop_advlock_args *ap)
163 {
164 struct vnode *vp = ap->a_vp;
165 struct flock *fl = ap->a_fl;
166 vfs_context_t context = ap->a_context;
167 struct lockf *lock;
168 off_t start, end, oadd;
169 u_quad_t size;
170 int error;
171 struct lockf **head = &vp->v_lockf;
172
173 /* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */
174
175 /*
176 * Avoid the common case of unlocking when the vnode has no locks.
177 */
178 if (*head == (struct lockf *)0) {
179 if (ap->a_op != F_SETLK) {
180 fl->l_type = F_UNLCK;
181 LOCKF_DEBUG(0, "lf_advlock: '%s' unlock without lock\n", vfs_context_proc(context)->p_comm);
182 return (0);
183 }
184 }
185
186 /*
187 * Convert the flock structure into a start and end.
188 */
189 switch (fl->l_whence) {
190
191 case SEEK_SET:
192 case SEEK_CUR:
193 /*
194 * Caller is responsible for adding any necessary offset
195 * when SEEK_CUR is used.
196 */
197 start = fl->l_start;
198 break;
199
200 case SEEK_END:
201
202 /*
203 * It's OK to cast the u_quad_t to an off_t here, since they
204 * are the same storage size, and the value of the returned
205 * contents will never overflow into the sign bit. We need to
206 * do this because we will use size to force range checks.
207 */
208 if ((error = vnode_size(vp, (off_t *)&size, context))) {
209 LOCKF_DEBUG(0, "lf_advlock: vnode_getattr failed: %d\n", error);
210 return (error);
211 }
212
213 if (size > OFF_MAX ||
214 (fl->l_start > 0 &&
215 size > (u_quad_t)(OFF_MAX - fl->l_start)))
216 return (EOVERFLOW);
217 start = size + fl->l_start;
218 break;
219
220 default:
221 LOCKF_DEBUG(0, "lf_advlock: unknown whence %d\n", fl->l_whence);
222 return (EINVAL);
223 }
224 if (start < 0) {
225 LOCKF_DEBUG(0, "lf_advlock: start < 0 (%qd)\n", start);
226 return (EINVAL);
227 }
228 if (fl->l_len < 0) {
229 if (start == 0) {
230 LOCKF_DEBUG(0, "lf_advlock: len < 0 & start == 0\n");
231 return (EINVAL);
232 }
233 end = start - 1;
234 start += fl->l_len;
235 if (start < 0) {
236 LOCKF_DEBUG(0, "lf_advlock: start < 0 (%qd)\n", start);
237 return (EINVAL);
238 }
239 } else if (fl->l_len == 0)
240 end = -1;
241 else {
242 oadd = fl->l_len - 1;
243 if (oadd > (off_t)(OFF_MAX - start)) {
244 LOCKF_DEBUG(0, "lf_advlock: overflow\n");
245 return (EOVERFLOW);
246 }
247 end = start + oadd;
248 }
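/*
 * Worked example of the conversion above: l_whence = SEEK_SET, l_start = 100
 * and l_len = 10 yield [start, end] = [100, 109]; l_len = 0 yields end = -1,
 * meaning "through end of file"; a negative length such as l_start = 100,
 * l_len = -10 locks the ten bytes immediately before the given start,
 * i.e. [90, 99].
 */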
249 /*
250 * Create the lockf structure
251 */
252 MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
253 if (lock == NULL)
254 return (ENOLCK);
255 lock->lf_start = start;
256 lock->lf_end = end;
257 lock->lf_id = ap->a_id;
258 lock->lf_vnode = vp;
259 lock->lf_type = fl->l_type;
260 lock->lf_head = head;
261 lock->lf_next = (struct lockf *)0;
262 TAILQ_INIT(&lock->lf_blkhd);
263 lock->lf_flags = ap->a_flags;
264
265 if (ap->a_flags & F_FLOCK)
266 lock->lf_flags |= F_WAKE1_SAFE;
267
268 lck_mtx_lock(&vp->v_lock); /* protect the lockf list */
269 /*
270 * Do the requested operation.
271 */
272 switch(ap->a_op) {
273 case F_SETLK:
274 error = lf_setlock(lock);
275 break;
276
277 case F_UNLCK:
278 error = lf_clearlock(lock);
279 FREE(lock, M_LOCKF);
280 break;
281
282 case F_GETLK:
283 error = lf_getlock(lock, fl);
284 FREE(lock, M_LOCKF);
285 break;
286
287 default:
288 FREE(lock, M_LOCKF);
289 error = EINVAL;
290 break;
291 }
292 lck_mtx_unlock(&vp->v_lock); /* done manipulating the list */
293
294 LOCKF_DEBUG(0, "lf_advlock: normal exit: %d\n\n", error);
295 return (error);
296 }
297
298
299 /*
300 * lf_coelesce_adjacent
301 *
302 * Description: Helper function: when setting a lock, coelesce adjacent
303 * locks. Needed because adjacent locks are not overlapping,
304 * but POSIX requires that they be coelesced.
305 *
306 * Parameters: lock The new lock which may be adjacent
307 * to already locked regions, and which
308 * should therefore be coelesced with them
309 *
310 * Returns: <void>
311 */
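/*
 * For example, if this process already holds an F_WRLCK covering bytes
 * [0, 49] and the new lock covers [50, 99] with the same owner and type,
 * the two are merged below into a single lock covering [0, 99] and the
 * now-redundant lockf structure is freed.
 */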
312 static void
313 lf_coelesce_adjacent(struct lockf *lock)
314 {
315 struct lockf **lf = lock->lf_head;
316
317 while (*lf != NOLOCKF) {
318 /* reject locks that obviously could not be coelesced */
319 if ((*lf == lock) ||
320 ((*lf)->lf_id != lock->lf_id) ||
321 ((*lf)->lf_type != lock->lf_type)) {
322 lf = &(*lf)->lf_next;
323 continue;
324 }
325
326 /* If the lock ends adjacent to us, we can coelesce it */
327 if ((*lf)->lf_end != -1 &&
328 ((*lf)->lf_end + 1) == lock->lf_start) {
329 struct lockf *adjacent = *lf;
330
331 LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent previous\n");
332 lock->lf_start = (*lf)->lf_start;
333 *lf = lock;
334 lf = &(*lf)->lf_next;
335 FREE(adjacent, M_LOCKF);
336 continue;
337 }
338 /* If the lock starts adjacent to us, we can coelesce it */
339 if (lock->lf_end != -1 &&
340 (lock->lf_end + 1) == (*lf)->lf_start) {
341 struct lockf *adjacent = *lf;
342
343 LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent following\n");
344 lock->lf_end = (*lf)->lf_end;
345 lock->lf_next = (*lf)->lf_next;
346 lf = &lock->lf_next;
347 FREE(adjacent, M_LOCKF);
348 continue;
349 }
350
351 /* no matching conditions; go on to next lock */
352 lf = &(*lf)->lf_next;
353 }
354 }
355
356
357 /*
358 * lf_setlock
359 *
360 * Description: Set a byte-range lock.
361 *
362 * Parameters: lock The lock structure describing the lock
363 * to be set; allocated by the caller, it
364 * will be linked into the lock list if
365 * the set is successful, and freed if the
366 * set is unsuccessful.
367 *
368 * Returns: 0 Success
369 * EAGAIN
370 * EDEADLK
371 * lf_split:ENOLCK
372 * lf_clearlock:ENOLCK
373 * msleep:EINTR
374 *
375 * Notes: We add the lock to the provisional lock list. We do not
376 * coelesce at this time; this has implications for other lock
377 * requestors in the blocker search mechanism.
378 */
379 static int
380 lf_setlock(struct lockf *lock)
381 {
382 struct lockf *block;
383 struct lockf **head = lock->lf_head;
384 struct lockf **prev, *overlap, *ltmp;
385 static char lockstr[] = "lockf";
386 int priority, needtolink, error;
387 struct vnode *vp = lock->lf_vnode;
388 overlap_t ovcase;
389
390 #ifdef LOCKF_DEBUGGING
391 if (lockf_debug & 1) {
392 lf_print("lf_setlock", lock);
393 lf_printlist("lf_setlock(in)", lock);
394 }
395 #endif /* LOCKF_DEBUGGING */
396
397 /*
398 * Set the priority
399 */
400 priority = PLOCK;
401 if (lock->lf_type == F_WRLCK)
402 priority += 4;
403 priority |= PCATCH;
404 /*
405 * Scan lock list for this file looking for locks that would block us.
406 */
407 while ((block = lf_getblock(lock))) {
408 /*
409 * Free the structure and return if nonblocking.
410 */
411 if ((lock->lf_flags & F_WAIT) == 0) {
412 FREE(lock, M_LOCKF);
413 return (EAGAIN);
414 }
415
416 /*
417 * We are blocked. Since flock style locks cover
418 * the whole file, there is no chance for deadlock.
419 * For byte-range locks we must check for deadlock.
420 *
421 * Deadlock detection is done by looking through the
422 * wait channels to see if there are any cycles that
423 * involve us. MAXDEPTH is set just to make sure we
424 * do not go off into neverland.
425 */
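/*
 * Example of the cycle being detected: process A holds [0, 9] and is
 * asleep here waiting for [10, 19]; process B holds [10, 19] and now
 * requests [0, 9].  B's request is blocked by A's lock, A is blocked on
 * a lock owned by B, so putting B to sleep would deadlock and we return
 * EDEADLK instead.
 */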
426 if ((lock->lf_flags & F_POSIX) &&
427 (block->lf_flags & F_POSIX)) {
428 struct proc *wproc, *bproc;
429 struct uthread *ut;
430 struct lockf *waitblock;
431 int i = 0;
432
433 /* The block is waiting on something */
434 wproc = (struct proc *)block->lf_id;
435 proc_lock(wproc);
436 TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
437 /*
438 * While the thread is asleep (uu_wchan != 0)
439 * in this code (uu_wmesg == lockstr)
440 * and we have not exceeded the maximum cycle
441 * depth (i < maxlockdepth), then check for a
442 * cycle to see if the lock is blocked behind
443 * someone blocked behind us.
444 */
445 while (((waitblock = (struct lockf *)ut->uu_wchan) != NULL) &&
446 ut->uu_wmesg == lockstr &&
447 (i++ < maxlockdepth)) {
448 waitblock = (struct lockf *)ut->uu_wchan;
449 /*
450 * Get the lock blocking the lock
451 * which would block us, and make
452 * certain it hasn't come unblocked
453 * (been granted, e.g. between the time
454 * we called lf_getblock, and the time
455 * we successfully acquired the
456 * proc_lock).
457 */
458 waitblock = waitblock->lf_next;
459 if (waitblock == NULL)
460 break;
461
462 /*
463 * Make sure it's an advisory range
464 * lock and not an overall file lock;
465 * if we mix lock types, it's our own
466 * fault.
467 */
468 if ((waitblock->lf_flags & F_POSIX) == 0)
469 break;
470
471 /*
472 * If the owner of the lock that's
473 * blocking a lock that's blocking us
474 * is the same as the owner requesting
475 * this lock, then we would deadlock, so error out.
476 */
477 bproc = (struct proc *)waitblock->lf_id;
478 if (bproc == (struct proc *)lock->lf_id) {
479 proc_unlock(wproc);
480 FREE(lock, M_LOCKF);
481 return (EDEADLK);
482 }
483 }
484 }
485 proc_unlock(wproc);
486 }
487
488 /*
489 * For flock type locks, we must first remove
490 * any shared locks that we hold before we sleep
491 * waiting for an exclusive lock.
492 */
493 if ((lock->lf_flags & F_FLOCK) &&
494 lock->lf_type == F_WRLCK) {
495 lock->lf_type = F_UNLCK;
496 if ((error = lf_clearlock(lock)) != 0) {
497 FREE(lock, M_LOCKF);
498 return (error);
499 }
500 lock->lf_type = F_WRLCK;
501 }
502 /*
503 * Add our lock to the blocked list and sleep until we're free.
504 * Remember who blocked us (for deadlock detection).
505 */
506 lock->lf_next = block;
507 TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
508
509 if ( !(lock->lf_flags & F_FLOCK))
510 block->lf_flags &= ~F_WAKE1_SAFE;
511
512 #ifdef LOCKF_DEBUGGING
513 if (lockf_debug & 1) {
514 lf_print("lf_setlock: blocking on", block);
515 lf_printlist("lf_setlock(block)", block);
516 }
517 #endif /* LOCKF_DEBUGGING */
518 error = msleep(lock, &vp->v_lock, priority, lockstr, 0);
519
520 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
521 struct lockf *tlock;
522
523 if ((block = lf_getblock(lock))) {
524 TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
525 tlock->lf_next = block;
526 }
527 TAILQ_CONCAT(&block->lf_blkhd, &lock->lf_blkhd, lf_block);
528 }
529 }
530 if (error) { /* XXX */
531 /*
532 * We may have been awakened by a signal and/or by a
533 * debugger continuing us (in which cases we must remove
534 * ourselves from the blocked list) and/or by another
535 * process releasing a lock (in which case we have
536 * already been removed from the blocked list and our
537 * lf_next field set to NOLOCKF).
538 */
539 if (lock->lf_next) {
540 TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
541 lock->lf_next = NOLOCKF;
542 }
543 if (!TAILQ_EMPTY(&lock->lf_blkhd))
544 lf_wakelock(lock, TRUE);
545
546 FREE(lock, M_LOCKF);
547 return (error);
548 } /* XXX */
549 }
550 /*
551 * No blocks!! Add the lock. Note that we will
552 * downgrade or upgrade any overlapping locks this
553 * process already owns.
554 *
555 * Skip over locks owned by other processes.
556 * Handle any locks that overlap and are owned by ourselves.
557 */
558 prev = head;
559 block = *head;
560 needtolink = 1;
561 for (;;) {
562 ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
563 if (ovcase)
564 block = overlap->lf_next;
565 /*
566 * Six cases:
567 * 0) no overlap
568 * 1) overlap == lock
569 * 2) overlap contains lock
570 * 3) lock contains overlap
571 * 4) overlap starts before lock
572 * 5) overlap ends after lock
573 */
574 switch (ovcase) {
575 case OVERLAP_NONE:
576 if (needtolink) {
577 *prev = lock;
578 lock->lf_next = overlap;
579 }
580 break;
581
582 case OVERLAP_EQUALS_LOCK:
583 /*
584 * If downgrading lock, others may be
585 * able to acquire it.
586 */
587 if (lock->lf_type == F_RDLCK &&
588 overlap->lf_type == F_WRLCK)
589 lf_wakelock(overlap, TRUE);
590 overlap->lf_type = lock->lf_type;
591 FREE(lock, M_LOCKF);
592 lock = overlap; /* for lf_coelesce_adjacent() */
593 break;
594
595 case OVERLAP_CONTAINS_LOCK:
596 /*
597 * Check for common starting point and different types.
598 */
599 if (overlap->lf_type == lock->lf_type) {
600 FREE(lock, M_LOCKF);
601 lock = overlap; /* for lf_coelesce_adjacent() */
602 break;
603 }
604 if (overlap->lf_start == lock->lf_start) {
605 *prev = lock;
606 lock->lf_next = overlap;
607 overlap->lf_start = lock->lf_end + 1;
608 } else {
609 /*
610 * If we can't split the lock, we can't
611 * grant it. Claim a system limit for the
612 * resource shortage.
613 */
614 if (lf_split(overlap, lock)) {
615 FREE(lock, M_LOCKF);
616 return (ENOLCK);
617 }
618 }
619 lf_wakelock(overlap, TRUE);
620 break;
621
622 case OVERLAP_CONTAINED_BY_LOCK:
623 /*
624 * If downgrading lock, others may be able to
625 * acquire it, otherwise take the list.
626 */
627 if (lock->lf_type == F_RDLCK &&
628 overlap->lf_type == F_WRLCK) {
629 lf_wakelock(overlap, TRUE);
630 } else {
631 while (!TAILQ_EMPTY(&overlap->lf_blkhd)) {
632 ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
633 TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
634 lf_block);
635 TAILQ_INSERT_TAIL(&lock->lf_blkhd,
636 ltmp, lf_block);
637 ltmp->lf_next = lock;
638 }
639 }
640 /*
641 * Add the new lock if necessary and delete the overlap.
642 */
643 if (needtolink) {
644 *prev = lock;
645 lock->lf_next = overlap->lf_next;
646 prev = &lock->lf_next;
647 needtolink = 0;
648 } else
649 *prev = overlap->lf_next;
650 FREE(overlap, M_LOCKF);
651 continue;
652
653 case OVERLAP_STARTS_BEFORE_LOCK:
654 /*
655 * Add lock after overlap on the list.
656 */
657 lock->lf_next = overlap->lf_next;
658 overlap->lf_next = lock;
659 overlap->lf_end = lock->lf_start - 1;
660 prev = &lock->lf_next;
661 lf_wakelock(overlap, TRUE);
662 needtolink = 0;
663 continue;
664
665 case OVERLAP_ENDS_AFTER_LOCK:
666 /*
667 * Add the new lock before overlap.
668 */
669 if (needtolink) {
670 *prev = lock;
671 lock->lf_next = overlap;
672 }
673 overlap->lf_start = lock->lf_end + 1;
674 lf_wakelock(overlap, TRUE);
675 break;
676 }
677 break;
678 }
679 /* Coelesce adjacent locks with identical attributes */
680 lf_coelesce_adjacent(lock);
681 #ifdef LOCKF_DEBUGGING
682 if (lockf_debug & 1) {
683 lf_print("lf_setlock: got the lock", lock);
684 lf_printlist("lf_setlock(out)", lock);
685 }
686 #endif /* LOCKF_DEBUGGING */
687 return (0);
688 }
689
690
691 /*
692 * lf_clearlock
693 *
694 * Description: Remove a byte-range lock on a vnode. Generally, find the
695 * lock (or an overlap to that lock) and remove it (or shrink
696 * it), then wakeup anyone we can.
697 *
698 * Parameters: unlock The lock to clear
699 *
700 * Returns: 0 Success
701 * lf_split:ENOLCK
702 *
703 * Notes: A caller may unlock all the locks owned by the caller by
704 * specifying the entire file range; locks owned by other
705 * callers are not affected by this operation.
706 */
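/*
 * For example, an unlock request spanning the entire file (lf_start 0,
 * lf_end -1) removes every lock this owner holds on the vnode, while an
 * unlock of [40, 59] against an existing [0, 99] lock of the same owner
 * shrinks or splits that lock via lf_split() below.
 */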
707 static int
708 lf_clearlock(struct lockf *unlock)
709 {
710 struct lockf **head = unlock->lf_head;
711 struct lockf *lf = *head;
712 struct lockf *overlap, **prev;
713 overlap_t ovcase;
714
715 if (lf == NOLOCKF)
716 return (0);
717 #ifdef LOCKF_DEBUGGING
718 if (unlock->lf_type != F_UNLCK)
719 panic("lf_clearlock: bad type");
720 if (lockf_debug & 1)
721 lf_print("lf_clearlock", unlock);
722 #endif /* LOCKF_DEBUGGING */
723 prev = head;
724 while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
725 /*
726 * Wakeup the list of locks to be retried.
727 */
728 lf_wakelock(overlap, FALSE);
729
730 switch (ovcase) {
731 case OVERLAP_NONE: /* satisfy compiler enum/switch */
732 break;
733
734 case OVERLAP_EQUALS_LOCK:
735 *prev = overlap->lf_next;
736 FREE(overlap, M_LOCKF);
737 break;
738
739 case OVERLAP_CONTAINS_LOCK: /* split it */
740 if (overlap->lf_start == unlock->lf_start) {
741 overlap->lf_start = unlock->lf_end + 1;
742 break;
743 }
744 /*
745 * If we can't split the lock, we can't grant it.
746 * Claim a system limit for the resource shortage.
747 */
748 if (lf_split(overlap, unlock))
749 return (ENOLCK);
750 overlap->lf_next = unlock->lf_next;
751 break;
752
753 case OVERLAP_CONTAINED_BY_LOCK:
754 *prev = overlap->lf_next;
755 lf = overlap->lf_next;
756 FREE(overlap, M_LOCKF);
757 continue;
758
759 case OVERLAP_STARTS_BEFORE_LOCK:
760 overlap->lf_end = unlock->lf_start - 1;
761 prev = &overlap->lf_next;
762 lf = overlap->lf_next;
763 continue;
764
765 case OVERLAP_ENDS_AFTER_LOCK:
766 overlap->lf_start = unlock->lf_end + 1;
767 break;
768 }
769 break;
770 }
771 #ifdef LOCKF_DEBUGGING
772 if (lockf_debug & 1)
773 lf_printlist("lf_clearlock", unlock);
774 #endif /* LOCKF_DEBUGGING */
775 return (0);
776 }
777
778
779 /*
780 * lf_getlock
781 *
782 * Description: Check whether there is a blocking lock, and if so return
783 * its process identifier into the lock being requested.
784 *
785 * Parameters: lock Pointer to lock to test for blocks
786 * fl Pointer to flock structure to receive
787 * the blocking lock information, if a
788 * blocking lock is found.
789 *
790 * Returns: 0 Success
791 *
792 * Implicit Returns:
793 * *fl Contents modified to reflect the
794 * blocking lock, if one is found; not
795 * modified otherwise
796 *
797 * Notes: fl->l_pid will be (-1) for file locks and will only be set to
798 * the blocking process ID for advisory record locks.
799 */
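/*
 * Illustrative outcome (hypothetical values): if another process holds an
 * exclusive lock on bytes [0, 99], an F_GETLK request for an overlapping
 * range comes back with l_type = F_WRLCK, l_whence = SEEK_SET, l_start = 0,
 * l_len = 100 and l_pid naming that process; if nothing conflicts, only
 * l_type is changed, to F_UNLCK.
 */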
800 static int
801 lf_getlock(struct lockf *lock, struct flock *fl)
802 {
803 struct lockf *block;
804
805 #ifdef LOCKF_DEBUGGING
806 if (lockf_debug & 1)
807 lf_print("lf_getlock", lock);
808 #endif /* LOCKF_DEBUGGING */
809
810 if ((block = lf_getblock(lock))) {
811 fl->l_type = block->lf_type;
812 fl->l_whence = SEEK_SET;
813 fl->l_start = block->lf_start;
814 if (block->lf_end == -1)
815 fl->l_len = 0;
816 else
817 fl->l_len = block->lf_end - block->lf_start + 1;
818 if (block->lf_flags & F_POSIX)
819 fl->l_pid = proc_pid((struct proc *)(block->lf_id));
820 else
821 fl->l_pid = -1;
822 } else {
823 fl->l_type = F_UNLCK;
824 }
825 return (0);
826 }
827
828
829 /*
830 * lf_getblock
831 *
832 * Description: Walk the list of locks for a vnode and return the first
833 * blocking lock. A lock is considered blocking if we are not
834 * the lock owner; otherwise, we are permitted to upgrade or
835 * downgrade it, and it's not considered blocking.
836 *
837 * Parameters: lock The lock for which we are interested
838 * in obtaining the blocking lock, if any
839 *
840 * Returns: NOLOCKF No blocking lock exists
841 * !NOLOCKF The address of the blocking lock's
842 * struct lockf.
843 */
844 static struct lockf *
845 lf_getblock(struct lockf *lock)
846 {
847 struct lockf **prev, *overlap, *lf = *(lock->lf_head);
848 int ovcase;
849
850 prev = lock->lf_head;
851 while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap)) != OVERLAP_NONE) {
852 /*
853 * We've found an overlap, see if it blocks us
854 */
855 if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
856 return (overlap);
857 /*
858 * Nope, point to the next one on the list and
859 * see if it blocks us
860 */
861 lf = overlap->lf_next;
862 }
863 return (NOLOCKF);
864 }
865
866
867 /*
868 * lf_findoverlap
869 *
870 * Description: Walk the list of locks to find an overlapping lock (if any).
871 *
872 * Parameters: lf First lock on lock list
873 * lock The lock we are checking for an overlap
874 * check Check type
875 * prev Pointer in which to return the address
876 * of the pointer that links to the
877 * overlapping lock, if an overlap is found
878 * overlap pointer to pointer to contain address
879 * of overlapping lock
880 *
881 * Returns: OVERLAP_NONE
882 * OVERLAP_EQUALS_LOCK
883 * OVERLAP_CONTAINS_LOCK
884 * OVERLAP_CONTAINED_BY_LOCK
885 * OVERLAP_STARTS_BEFORE_LOCK
886 * OVERLAP_ENDS_AFTER_LOCK
887 *
888 * Implicit Returns:
889 * *prev The address of the next pointer in the
890 * lock previous to the overlapping lock;
891 * this is generally used to relink the
892 * lock list, avoiding a second iteration.
893 * *overlap The pointer to the overlapping lock
894 * itself; this is used to return data in
895 * the check == OTHERS case, and for the
896 * caller to modify the overlapping lock,
897 * in the check == SELF case
898 *
899 * Note: This returns only the FIRST overlapping lock. There may be
900 * more than one. lf_getlock will return the first blocking lock,
901 * while lf_setlock will iterate over all overlapping locks to handle each one.
902 *
903 * The check parameter can be SELF, meaning we are looking for
904 * overlapping locks owned by us, or it can be OTHERS, meaning
905 * we are looking for overlapping locks owned by someone else so
906 * we can report a blocking lock on an F_GETLK request.
907 *
908 * The values of *overlap and *prev are modified, even if there is
909 * no overlapping lock found; always check the return code.
910 */
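/*
 * Concrete examples, with the requested lock covering bytes [10, 20]:
 * an existing lock over [10, 20] is OVERLAP_EQUALS_LOCK; [5, 25] is
 * OVERLAP_CONTAINS_LOCK; [12, 18] is OVERLAP_CONTAINED_BY_LOCK; [5, 15]
 * is OVERLAP_STARTS_BEFORE_LOCK; [15, 25] is OVERLAP_ENDS_AFTER_LOCK;
 * [30, 40] does not overlap at all.
 */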
911 static overlap_t
912 lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
913 struct lockf ***prev, struct lockf **overlap)
914 {
915 off_t start, end;
916
917 *overlap = lf;
918 if (lf == NOLOCKF)
919 return (0);
920 #ifdef LOCKF_DEBUGGING
921 if (lockf_debug & 2)
922 lf_print("lf_findoverlap: looking for overlap in", lock);
923 #endif /* LOCKF_DEBUGGING */
924 start = lock->lf_start;
925 end = lock->lf_end;
926 while (lf != NOLOCKF) {
927 if (((type & SELF) && lf->lf_id != lock->lf_id) ||
928 ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
929 *prev = &lf->lf_next;
930 *overlap = lf = lf->lf_next;
931 continue;
932 }
933 #ifdef LOCKF_DEBUGGING
934 if (lockf_debug & 2)
935 lf_print("\tchecking", lf);
936 #endif /* LOCKF_DEBUGGING */
937 /*
938 * OK, check for overlap
939 */
940 if ((lf->lf_end != -1 && start > lf->lf_end) ||
941 (end != -1 && lf->lf_start > end)) {
942 /* Case 0 */
943 LOCKF_DEBUG(2, "no overlap\n");
944 if ((type & SELF) && end != -1 && lf->lf_start > end)
945 return (OVERLAP_NONE);
946 *prev = &lf->lf_next;
947 *overlap = lf = lf->lf_next;
948 continue;
949 }
950 if ((lf->lf_start == start) && (lf->lf_end == end)) {
951 LOCKF_DEBUG(2, "overlap == lock\n");
952 return (OVERLAP_EQUALS_LOCK);
953 }
954 if ((lf->lf_start <= start) &&
955 (end != -1) &&
956 ((lf->lf_end >= end) || (lf->lf_end == -1))) {
957 LOCKF_DEBUG(2, "overlap contains lock\n");
958 return (OVERLAP_CONTAINS_LOCK);
959 }
960 if (start <= lf->lf_start &&
961 (end == -1 ||
962 (lf->lf_end != -1 && end >= lf->lf_end))) {
963 LOCKF_DEBUG(2, "lock contains overlap\n");
964 return (OVERLAP_CONTAINED_BY_LOCK);
965 }
966 if ((lf->lf_start < start) &&
967 ((lf->lf_end >= start) || (lf->lf_end == -1))) {
968 LOCKF_DEBUG(2, "overlap starts before lock\n");
969 return (OVERLAP_STARTS_BEFORE_LOCK);
970 }
971 if ((lf->lf_start > start) &&
972 (end != -1) &&
973 ((lf->lf_end > end) || (lf->lf_end == -1))) {
974 LOCKF_DEBUG(2, "overlap ends after lock\n");
975 return (OVERLAP_ENDS_AFTER_LOCK);
976 }
977 panic("lf_findoverlap: default");
978 }
979 return (OVERLAP_NONE);
980 }
981
982
983 /*
984 * lf_split
985 *
986 * Description: Split a lock and a contained region into two or three locks
987 * as necessary.
988 *
989 * Parameters: lock1 Lock to split
990 * lock2 Overlapping lock region requiring the
991 * split (upgrade/downgrade/unlock)
992 *
993 * Returns: 0 Success
994 * ENOLCK No memory for new lock
995 *
996 * Implicit Returns:
997 * *lock1 Modified original lock
998 * *lock2 Overlapping lock (inserted into list)
999 * (new lock) Potential new lock inserted into list
1000 * if split results in 3 locks
1001 *
1002 * Notes: This operation can only fail if the split would result in three
1003 * locks, and there is insufficient memory to allocate the third
1004 * lock; in that case, neither of the locks will be modified.
1005 */
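/*
 * For example, unlocking or downgrading bytes [40, 59] out of a lock
 * covering [0, 99] splits it into [0, 39] and [60, 99]; the third lockf
 * structure allocated below holds the [60, 99] tail, and only that
 * allocation can fail.
 */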
1006 static int
1007 lf_split(struct lockf *lock1, struct lockf *lock2)
1008 {
1009 struct lockf *splitlock;
1010
1011 #ifdef LOCKF_DEBUGGING
1012 if (lockf_debug & 2) {
1013 lf_print("lf_split", lock1);
1014 lf_print("splitting from", lock2);
1015 }
1016 #endif /* LOCKF_DEBUGGING */
1017 /*
1018 * Check to see if splitting into only two pieces.
1019 */
1020 if (lock1->lf_start == lock2->lf_start) {
1021 lock1->lf_start = lock2->lf_end + 1;
1022 lock2->lf_next = lock1;
1023 return (0);
1024 }
1025 if (lock1->lf_end == lock2->lf_end) {
1026 lock1->lf_end = lock2->lf_start - 1;
1027 lock2->lf_next = lock1->lf_next;
1028 lock1->lf_next = lock2;
1029 return (0);
1030 }
1031 /*
1032 * Make a new lock consisting of the last part of
1033 * the encompassing lock
1034 */
1035 MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
1036 if (splitlock == NULL)
1037 return (ENOLCK);
1038 bcopy(lock1, splitlock, sizeof *splitlock);
1039 splitlock->lf_start = lock2->lf_end + 1;
1040 TAILQ_INIT(&splitlock->lf_blkhd);
1041 lock1->lf_end = lock2->lf_start - 1;
1042 /*
1043 * OK, now link it in
1044 */
1045 splitlock->lf_next = lock1->lf_next;
1046 lock2->lf_next = splitlock;
1047 lock1->lf_next = lock2;
1048
1049 return (0);
1050 }
1051
1052
1053 /*
1054 * lf_wakelock
1055 *
1056 * Wakeup a blocklist in the case of a downgrade or unlock, since others
1057 * waiting on the lock may now be able to acquire it.
1058 *
1059 * Parameters: listhead Lock list head on which waiters may
1060 * have pending locks
1061 *
1062 * Returns: <void>
1063 *
1064 * Notes: This function iterates a list of locks and wakes all waiters,
1065 * rather than only waiters for the contended regions. Because
1066 * of this, for heavily contended files, this can result in a
1067 * "thundering herd" situation. Refactoring the code could make
1068 * this operation more efficient, if heavy contention ever results
1069 * in a real-world performance problem.
1070 */
1071 static void
1072 lf_wakelock(struct lockf *listhead, boolean_t force_all)
1073 {
1074 struct lockf *wakelock;
1075 boolean_t wake_all = TRUE;
1076
1077 if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE))
1078 wake_all = FALSE;
1079
1080 while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1081 wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
1082 TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
1083
1084 wakelock->lf_next = NOLOCKF;
1085 #ifdef LOCKF_DEBUGGING
1086 if (lockf_debug & 2)
1087 lf_print("lf_wakelock: awakening", wakelock);
1088 #endif /* LOCKF_DEBUGGING */
1089 if (wake_all == FALSE) {
1090 /*
1091 * If there are items on the list head block list,
1092 * move them to the wakelock list instead, and then
1093 * correct their lf_next pointers.
1094 */
1095 if (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1096 TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);
1097
1098 struct lockf *tlock;
1099
1100 TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
1101 tlock->lf_next = wakelock;
1102 }
1103 }
1104 }
1105 wakeup(wakelock);
1106
1107 if (wake_all == FALSE)
1108 break;
1109 }
1110 }
1111
1112
1113 #ifdef LOCKF_DEBUGGING
1114 /*
1115 * lf_print DEBUG
1116 *
1117 * Print out a lock; lock information is prefixed by the string in 'tag'
1118 *
1119 * Parameters: tag A string tag for debugging
1120 * lock The lock whose information should be
1121 * displayed
1122 *
1123 * Returns: <void>
1124 */
1125 void
1126 lf_print(const char *tag, struct lockf *lock)
1127 {
1128 printf("%s: lock %p for ", tag, (void *)lock);
1129 if (lock->lf_flags & F_POSIX)
1130 printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid);
1131 else
1132 printf("id %p", (void *)lock->lf_id);
1133 if (lock->lf_vnode != 0)
1134 printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
1135 lock->lf_vnode,
1136 lock->lf_type == F_RDLCK ? "shared" :
1137 lock->lf_type == F_WRLCK ? "exclusive" :
1138 lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1139 (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1140 else
1141 printf(" %s, start 0x%016llx, end 0x%016llx",
1142 lock->lf_type == F_RDLCK ? "shared" :
1143 lock->lf_type == F_WRLCK ? "exclusive" :
1144 lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1145 (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1146 if (!TAILQ_EMPTY(&lock->lf_blkhd))
1147 printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
1148 else
1149 printf("\n");
1150 }
1151
1152
1153 /*
1154 * lf_printlist DEBUG
1155 *
1156 * Print out a lock list for the vnode associated with 'lock'; lock information
1157 * is prefixed by the string in 'tag'
1158 *
1159 * Parameters: tag A string tag for debugging
1160 * lock The lock whose vnode's lock list should
1161 * be displayed
1162 *
1163 * Returns: <void>
1164 */
1165 void
1166 lf_printlist(const char *tag, struct lockf *lock)
1167 {
1168 struct lockf *lf, *blk;
1169
1170 if (lock->lf_vnode == 0)
1171 return;
1172
1173 printf("%s: Lock list for vno %p:\n",
1174 tag, lock->lf_vnode);
1175 for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) {
1176 printf("\tlock %p for ",(void *)lf);
1177 if (lf->lf_flags & F_POSIX)
1178 printf("proc %ld",
1179 (long)((struct proc *)lf->lf_id)->p_pid);
1180 else
1181 printf("id %p", (void *)lf->lf_id);
1182 printf(", %s, start 0x%016llx, end 0x%016llx",
1183 lf->lf_type == F_RDLCK ? "shared" :
1184 lf->lf_type == F_WRLCK ? "exclusive" :
1185 lf->lf_type == F_UNLCK ? "unlock" :
1186 "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
1187 TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
1188 printf("\n\t\tlock request %p for ", (void *)blk);
1189 if (blk->lf_flags & F_POSIX)
1190 printf("proc %ld",
1191 (long)((struct proc *)blk->lf_id)->p_pid);
1192 else
1193 printf("id %p", (void *)blk->lf_id);
1194 printf(", %s, start 0x%016llx, end 0x%016llx",
1195 blk->lf_type == F_RDLCK ? "shared" :
1196 blk->lf_type == F_WRLCK ? "exclusive" :
1197 blk->lf_type == F_UNLCK ? "unlock" :
1198 "unknown", (intmax_t)blk->lf_start,
1199 (intmax_t)blk->lf_end);
1200 if (!TAILQ_EMPTY(&blk->lf_blkhd))
1201 panic("lf_printlist: bad list");
1202 }
1203 printf("\n");
1204 }
1205 }
1206 #endif /* LOCKF_DEBUGGING */