1 /*
2 * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1982, 1986, 1989, 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95
56 */
57 #include <rev_endian_fs.h>
58 #include <vm/vm_pager.h>
59
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/buf_internal.h>
63 #include <sys/proc.h>
64 #include <sys/kauth.h>
65 #include <sys/vnode_internal.h>
66 #include <sys/mount_internal.h>
67 #include <sys/kernel.h>
68 #include <sys/syslog.h>
69 #include <sys/quota.h>
70
71 #include <sys/vm.h>
72
73 #include <ufs/ufs/quota.h>
74 #include <ufs/ufs/inode.h>
75
76 #include <ufs/ffs/fs.h>
77 #include <ufs/ffs/ffs_extern.h>
78
79 #if REV_ENDIAN_FS
80 #include <ufs/ufs/ufs_byte_order.h>
81 #include <architecture/byte_order.h>
82 #endif /* REV_ENDIAN_FS */
83
84 extern u_long nextgennumber;
85
86 static ufs_daddr_t ffs_alloccg(struct inode *, int, ufs_daddr_t, int);
87 static ufs_daddr_t ffs_alloccgblk(struct fs *, struct cg *, ufs_daddr_t);
88 static ufs_daddr_t ffs_clusteralloc(struct inode *, int, ufs_daddr_t, int);
89 static ino_t ffs_dirpref(struct inode *);
90 static ufs_daddr_t ffs_fragextend(struct inode *, int, long, int, int);
91 static void ffs_fserr(struct fs *, u_int, char *);
92 static u_long ffs_hashalloc
93 (struct inode *, int, long, int, u_int32_t (*)());
94 static ino_t ffs_nodealloccg(struct inode *, int, ufs_daddr_t, int);
95 static ufs_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs_daddr_t, int);
96 static void ffs_clusteracct
97 (struct fs *fs, struct cg *cgp, ufs_daddr_t blkno, int cnt);
98
99 /*
100 * Allocate a block in the file system.
101 *
102 * The size of the requested block is given, which must be some
103 * multiple of fs_fsize and <= fs_bsize.
104 * A preference may be optionally specified. If a preference is given
105 * the following hierarchy is used to allocate a block:
106 * 1) allocate the requested block.
107 * 2) allocate a rotationally optimal block in the same cylinder.
108 * 3) allocate a block in the same cylinder group.
109  * 4) quadratically rehash into other cylinder groups, until an
110 * available block is located.
111  * If no block preference is given the following hierarchy is used
112 * to allocate a block:
113 * 1) allocate a block in the cylinder group that contains the
114 * inode for the file.
115  * 2) quadratically rehash into other cylinder groups, until an
116 * available block is located.
117 */
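/*
 * A minimal, compiled-out sketch of how a caller is typically expected to
 * combine ffs_blkpref() and ffs_alloc() for a direct block.  The wrapper
 * name and the choice of arguments are illustrative assumptions, not code
 * from this file; error handling and locking are omitted.
 */
#if 0
static int
example_alloc_data_block(struct inode *ip, ufs_daddr_t lbn, kauth_cred_t cred,
    ufs_daddr_t *bnp)
{
	struct fs *fs = ip->i_fs;
	ufs_daddr_t bpref;

	/* Ask the layout policy for a preferred starting block... */
	bpref = ffs_blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
	/* ...then let ffs_alloc() fall back through the hierarchy above. */
	return (ffs_alloc(ip, lbn, bpref, (int)fs->fs_bsize, cred, bnp));
}
#endif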
118 ffs_alloc(ip, lbn, bpref, size, cred, bnp)
119 register struct inode *ip;
120 ufs_daddr_t lbn, bpref;
121 int size;
122 kauth_cred_t cred;
123 ufs_daddr_t *bnp;
124 {
125 register struct fs *fs;
126 ufs_daddr_t bno;
127 int cg, error;
128 int devBlockSize=0;
129 *bnp = 0;
130 fs = ip->i_fs;
131 #if DIAGNOSTIC
132 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
133 printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n",
134 ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
135 panic("ffs_alloc: bad size");
136 }
137 if (cred == NOCRED)
138 panic("ffs_alloc: missing credential\n");
139 #endif /* DIAGNOSTIC */
140 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
141 goto nospace;
142 if (suser(cred, NULL) && freespace(fs, fs->fs_minfree) <= 0)
143 goto nospace;
144 devBlockSize = vfs_devblocksize(vnode_mount(ITOV(ip)));
145 #if QUOTA
146 if (error = chkdq(ip, (int64_t)size, cred, 0))
147 return (error);
148 #endif /* QUOTA */
149 if (bpref >= fs->fs_size)
150 bpref = 0;
151 if (bpref == 0)
152 cg = ino_to_cg(fs, ip->i_number);
153 else
154 cg = dtog(fs, bpref);
155 bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, size,
156 (u_int32_t (*)())ffs_alloccg);
157 if (bno > 0) {
158 ip->i_blocks += btodb(size, devBlockSize);
159 ip->i_flag |= IN_CHANGE | IN_UPDATE;
160 *bnp = bno;
161 return (0);
162 }
163 #if QUOTA
164 /*
165 * Restore user's disk quota because allocation failed.
166 */
167 (void) chkdq(ip, (int64_t)-size, cred, FORCE);
168 #endif /* QUOTA */
169 nospace:
170 ffs_fserr(fs, kauth_cred_getuid(cred), "file system full");
171 uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
172 return (ENOSPC);
173 }
174
175 /*
176 * Reallocate a fragment to a bigger size
177 *
178 * The number and size of the old block is given, and a preference
179 * and new size is also specified. The allocator attempts to extend
180 * the original block. Failing that, the regular block allocator is
181 * invoked to get an appropriate block.
182 */
183 ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp)
184 register struct inode *ip;
185 ufs_daddr_t lbprev;
186 ufs_daddr_t bpref;
187 int osize, nsize;
188 kauth_cred_t cred;
189 struct buf **bpp;
190 {
191 register struct fs *fs;
192 struct buf *bp;
193 int cg, request, error;
194 ufs_daddr_t bprev, bno;
195 int devBlockSize=0;
196
197 *bpp = 0;
198 fs = ip->i_fs;
199 #if DIAGNOSTIC
200 if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
201 (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
202 printf(
203 "dev = 0x%x, bsize = %d, osize = %d, nsize = %d, fs = %s\n",
204 ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
205 panic("ffs_realloccg: bad size");
206 }
207 if (cred == NOCRED)
208 panic("ffs_realloccg: missing credential\n");
209 #endif /* DIAGNOSTIC */
210 if (suser(cred, NULL) != 0 && freespace(fs, fs->fs_minfree) <= 0)
211 goto nospace;
212 if ((bprev = ip->i_db[lbprev]) == 0) {
213 printf("dev = 0x%x, bsize = %d, bprev = %d, fs = %s\n",
214 ip->i_dev, fs->fs_bsize, bprev, fs->fs_fsmnt);
215 panic("ffs_realloccg: bad bprev");
216 }
217 /*
218 * Allocate the extra space in the buffer.
219 */
220 if (error = (int)buf_bread(ITOV(ip), (daddr64_t)((unsigned)lbprev), osize, NOCRED, &bp)) {
221 buf_brelse(bp);
222 return (error);
223 }
224 devBlockSize = vfs_devblocksize(vnode_mount(ITOV(ip)));
225
226 #if QUOTA
227 if (error = chkdq(ip, (int64_t)(nsize - osize), cred, 0))
228 {
229 buf_brelse(bp);
230 return (error);
231 }
232 #endif /* QUOTA */
233 /*
234 * Check for extension in the existing location.
235 */
236 cg = dtog(fs, bprev);
237 if (bno = ffs_fragextend(ip, cg, (long)bprev, osize, nsize)) {
238 if ((ufs_daddr_t)buf_blkno(bp) != fsbtodb(fs, bno))
239 panic("bad blockno");
240 ip->i_blocks += btodb(nsize - osize, devBlockSize);
241 ip->i_flag |= IN_CHANGE | IN_UPDATE;
242 allocbuf(bp, nsize);
243 buf_setflags(bp, B_DONE);
244 bzero((char *)buf_dataptr(bp) + osize, (u_int)buf_size(bp) - osize);
245 *bpp = bp;
246 return (0);
247 }
248 /*
249 * Allocate a new disk location.
250 */
251 if (bpref >= fs->fs_size)
252 bpref = 0;
253 switch ((int)fs->fs_optim) {
254 case FS_OPTSPACE:
255 /*
256 * Allocate an exact sized fragment. Although this makes
257 * best use of space, we will waste time relocating it if
258 * the file continues to grow. If the fragmentation is
259 * less than half of the minimum free reserve, we choose
260 * to begin optimizing for time.
261 */
262 request = nsize;
263 if (fs->fs_minfree < 5 ||
264 fs->fs_cstotal.cs_nffree >
265 fs->fs_dsize * fs->fs_minfree / (2 * 100))
266 break;
267 log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n",
268 fs->fs_fsmnt);
269 fs->fs_optim = FS_OPTTIME;
270 break;
271 case FS_OPTTIME:
272 /*
273 * At this point we have discovered a file that is trying to
274 * grow a small fragment to a larger fragment. To save time,
275 * we allocate a full sized block, then free the unused portion.
276 * If the file continues to grow, the `ffs_fragextend' call
277 * above will be able to grow it in place without further
278 * copying. If aberrant programs cause disk fragmentation to
279 * grow within 2% of the free reserve, we choose to begin
280 * optimizing for space.
281 */
282 request = fs->fs_bsize;
283 if (fs->fs_cstotal.cs_nffree <
284 fs->fs_dsize * (fs->fs_minfree - 2) / 100)
285 break;
286 log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n",
287 fs->fs_fsmnt);
288 fs->fs_optim = FS_OPTSPACE;
289 break;
290 default:
291 printf("dev = 0x%x, optim = %d, fs = %s\n",
292 ip->i_dev, fs->fs_optim, fs->fs_fsmnt);
293 panic("ffs_realloccg: bad optim");
294 /* NOTREACHED */
295 }
296 bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, request,
297 (u_int32_t (*)())ffs_alloccg);
298 if (bno > 0) {
299 buf_setblkno(bp, (daddr64_t)((unsigned)fsbtodb(fs, bno)));
300 ffs_blkfree(ip, bprev, (long)osize);
301 if (nsize < request)
302 ffs_blkfree(ip, bno + numfrags(fs, nsize),
303 (long)(request - nsize));
304 ip->i_blocks += btodb(nsize - osize, devBlockSize);
305 ip->i_flag |= IN_CHANGE | IN_UPDATE;
306 allocbuf(bp, nsize);
307 buf_setflags(bp, B_DONE);
308 bzero((char *)buf_dataptr(bp) + osize, (u_int)buf_size(bp) - osize);
309 *bpp = bp;
310 return (0);
311 }
312 #if QUOTA
313 /*
314 * Restore user's disk quota because allocation failed.
315 */
316 (void) chkdq(ip, (int64_t)-(nsize - osize), cred, FORCE);
317 #endif /* QUOTA */
318 buf_brelse(bp);
319 nospace:
320 /*
321 * no space available
322 */
323 ffs_fserr(fs, kauth_cred_getuid(cred), "file system full");
324 uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
325 return (ENOSPC);
326 }
327
328 /*
329 * Reallocate a sequence of blocks into a contiguous sequence of blocks.
330 *
331 * The vnode and an array of buffer pointers for a range of sequential
332 * logical blocks to be made contiguous is given. The allocator attempts
333 * to find a range of sequential blocks starting as close as possible to
334 * an fs_rotdelay offset from the end of the allocation for the logical
335  * block immediately preceding the current range. If successful, the
336 * physical block numbers in the buffer pointers and in the inode are
337 * changed to reflect the new allocation. If unsuccessful, the allocation
338 * is left unchanged. The success in doing the reallocation is returned.
339 * Note that the error return is not reflected back to the user. Rather
340 * the previous block allocation will be used.
341 */
342 /* Note: This routine is unused in UBC cluster I/O */
343
344 int doasyncfree = 1;
345 int doreallocblks = 1;
346
347
348 /*
349 * Allocate an inode in the file system.
350 *
351 * If allocating a directory, use ffs_dirpref to select the inode.
352 * If allocating in a directory, the following hierarchy is followed:
353 * 1) allocate the preferred inode.
354 * 2) allocate an inode in the same cylinder group.
355  * 3) quadratically rehash into other cylinder groups, until an
356 * available inode is located.
357  * If no inode preference is given the following hierarchy is used
358 * to allocate an inode:
359 * 1) allocate an inode in cylinder group 0.
360  * 2) quadratically rehash into other cylinder groups, until an
361 * available inode is located.
362 */
363 int
364 ffs_valloc(
365 struct vnode *pvp,
366 mode_t mode,
367 kauth_cred_t cred,
368 struct vnode **vpp)
369
370 {
371 register struct inode *pip;
372 register struct fs *fs;
373 register struct inode *ip;
374 struct timeval tv;
375 ino_t ino, ipref;
376 int cg, error;
377
378 *vpp = NULL;
379 pip = VTOI(pvp);
380 fs = pip->i_fs;
381 if (fs->fs_cstotal.cs_nifree == 0)
382 goto noinodes;
383
384 if ((mode & IFMT) == IFDIR)
385 ipref = ffs_dirpref(pip);
386 else
387 ipref = pip->i_number;
388 if (ipref >= fs->fs_ncg * fs->fs_ipg)
389 ipref = 0;
390 cg = ino_to_cg(fs, ipref);
391 /*
392 * Track the number of dirs created one after another
393 * in a cg without intervening files.
394 */
395 if ((mode & IFMT) == IFDIR) {
396 if (fs->fs_contigdirs[cg] < 255)
397 fs->fs_contigdirs[cg]++;
398 } else {
399 if (fs->fs_contigdirs[cg] > 0)
400 fs->fs_contigdirs[cg]--;
401 }
402 ino = (ino_t)ffs_hashalloc(pip, cg, (long)ipref, mode, ffs_nodealloccg);
403 if (ino == 0)
404 goto noinodes;
405
406 error = ffs_vget_internal(pvp->v_mount, ino, vpp, NULL, NULL, mode, 0);
407 if (error) {
408 ffs_vfree(pvp, ino, mode);
409 return (error);
410 }
411 ip = VTOI(*vpp);
412
413 if (ip->i_mode) {
414 printf("mode = 0%o, inum = %d, fs = %s\n",
415 ip->i_mode, ip->i_number, fs->fs_fsmnt);
416 panic("ffs_valloc: dup alloc");
417 }
418 if (ip->i_blocks) { /* XXX */
419 printf("free inode %s/%d had %d blocks\n",
420 fs->fs_fsmnt, ino, ip->i_blocks);
421 ip->i_blocks = 0;
422 }
423 ip->i_flags = 0;
424 /*
425 * Set up a new generation number for this inode.
426 */
427 microtime(&tv);
428 if (++nextgennumber < (u_long)tv.tv_sec)
429 nextgennumber = tv.tv_sec;
430 ip->i_gen = nextgennumber;
431 return (0);
432 noinodes:
433 ffs_fserr(fs, kauth_cred_getuid(cred), "out of inodes");
434 uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
435 return (ENOSPC);
436 }
437
438 /*
439 * Find a cylinder group to place a directory.
440 *
441 * The policy implemented by this algorithm is to allocate a
442 * directory inode in the same cylinder group as its parent
443  * directory, but also to reserve space for its files' inodes
444 * and data. Restrict the number of directories which may be
445 * allocated one after another in the same cylinder group
446 * without intervening allocation of files.
447 */
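/*
 * Simplified, compiled-out model of the sizing arithmetic used below:
 * estimate how much space a directory and its files consume, then bound
 * the number of directories placed consecutively in one cylinder group.
 * The numbers in the comments are invented example values, not figures
 * from any particular filesystem.
 */
#if 0
static int
example_maxcontigdirs(struct fs *fs, int avgbfree, int avgndir)
{
	int cgsize, dirsize, curdirsize, maxcontigdirs;

	cgsize = fs->fs_fsize * fs->fs_fpg;		/* bytes per cg */
	dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;	/* est. bytes per dir */
	curdirsize = avgndir ?
	    (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
	if (dirsize < curdirsize)
		dirsize = curdirsize;
	/*
	 * E.g. an 8MB cylinder group with 16KB estimated per directory
	 * gives 512, which is then clamped to 255 and to the per-cg inode
	 * budget fs_ipg / fs_avgfpdir.
	 */
	maxcontigdirs = min(cgsize / dirsize, 255);
	if (fs->fs_avgfpdir > 0)
		maxcontigdirs = min(maxcontigdirs, fs->fs_ipg / fs->fs_avgfpdir);
	if (maxcontigdirs == 0)
		maxcontigdirs = 1;
	return (maxcontigdirs);
}
#endif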
448 static ino_t
449 ffs_dirpref(pip)
450 struct inode *pip;
451 {
452 register struct fs *fs;
453 int cg, prefcg, dirsize, cgsize;
454 int avgifree, avgbfree, avgndir, curdirsize;
455 int minifree, minbfree, maxndir;
456 int mincg, minndir;
457 int maxcontigdirs;
458
459 fs = pip->i_fs;
460 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
461 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
462 avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
463
464 /*
465 * Force allocation in another cg if creating a first level dir.
466 */
467 if (ITOV(pip)->v_flag & VROOT) {
468 #ifdef __APPLE__
469 prefcg = random() % fs->fs_ncg;
470 #else
471 prefcg = arc4random() % fs->fs_ncg;
472 #endif
473 mincg = prefcg;
474 minndir = fs->fs_ipg;
475 for (cg = prefcg; cg < fs->fs_ncg; cg++)
476 if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
477 fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
478 fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
479 mincg = cg;
480 minndir = fs->fs_cs(fs, cg).cs_ndir;
481 }
482 for (cg = 0; cg < prefcg; cg++)
483 if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
484 fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
485 fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
486 mincg = cg;
487 minndir = fs->fs_cs(fs, cg).cs_ndir;
488 }
489 return ((ino_t)(fs->fs_ipg * mincg));
490 }
491
492 /*
493  * Count various limits which are used for
494 * optimal allocation of a directory inode.
495 */
496 maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
497 minifree = avgifree - fs->fs_ipg / 4;
498 if (minifree < 0)
499 minifree = 0;
500 minbfree = avgbfree - fs->fs_fpg / fs->fs_frag / 4;
501 if (minbfree < 0)
502 minbfree = 0;
503 cgsize = fs->fs_fsize * fs->fs_fpg;
504 dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
505 curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
506 if (dirsize < curdirsize)
507 dirsize = curdirsize;
508 maxcontigdirs = min(cgsize / dirsize, 255);
509 if (fs->fs_avgfpdir > 0)
510 maxcontigdirs = min(maxcontigdirs,
511 fs->fs_ipg / fs->fs_avgfpdir);
512 if (maxcontigdirs == 0)
513 maxcontigdirs = 1;
514
515 /*
516 * Limit number of dirs in one cg and reserve space for
517 * regular files, but only if we have no deficit in
518 * inodes or space.
519 */
520 prefcg = ino_to_cg(fs, pip->i_number);
521 for (cg = prefcg; cg < fs->fs_ncg; cg++)
522 if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
523 fs->fs_cs(fs, cg).cs_nifree >= minifree &&
524 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
525 if (fs->fs_contigdirs[cg] < maxcontigdirs)
526 return ((ino_t)(fs->fs_ipg * cg));
527 }
528 for (cg = 0; cg < prefcg; cg++)
529 if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
530 fs->fs_cs(fs, cg).cs_nifree >= minifree &&
531 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
532 if (fs->fs_contigdirs[cg] < maxcontigdirs)
533 return ((ino_t)(fs->fs_ipg * cg));
534 }
535 /*
536  * This is a backstop when we have a deficit in space.
537 */
538 for (cg = prefcg; cg < fs->fs_ncg; cg++)
539 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
540 return ((ino_t)(fs->fs_ipg * cg));
541 for (cg = 0; cg < prefcg; cg++)
542 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
543 break;
544 return ((ino_t)(fs->fs_ipg * cg));
545 }
546
547 /*
548 * Select the desired position for the next block in a file. The file is
549 * logically divided into sections. The first section is composed of the
550 * direct blocks. Each additional section contains fs_maxbpg blocks.
551 *
552 * If no blocks have been allocated in the first section, the policy is to
553 * request a block in the same cylinder group as the inode that describes
554 * the file. If no blocks have been allocated in any other section, the
555 * policy is to place the section in a cylinder group with a greater than
556 * average number of free blocks. An appropriate cylinder group is found
557 * by using a rotor that sweeps the cylinder groups. When a new group of
558 * blocks is needed, the sweep begins in the cylinder group following the
559 * cylinder group from which the previous allocation was made. The sweep
560 * continues until a cylinder group with greater than the average number
561 * of free blocks is found. If the allocation is for the first block in an
562 * indirect block, the information on the previous allocation is unavailable;
563 * here a best guess is made based upon the logical block number being
564 * allocated.
565 *
566 * If a section is already partially allocated, the policy is to
567 * contiguously allocate fs_maxcontig blocks. The end of one of these
568 * contiguous blocks and the beginning of the next is physically separated
569 * so that the disk head will be in transit between them for at least
570 * fs_rotdelay milliseconds. This is to allow time for the processor to
571 * schedule another I/O transfer.
572 */
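/*
 * Compiled-out numeric illustration of the fs_rotdelay conversion done at
 * the end of this routine: milliseconds of delay are turned into a
 * fragment offset using the geometry recorded in the superblock.  The
 * 4ms / 60rps / 32-sector figures below are invented for the example.
 */
#if 0
static ufs_daddr_t
example_rotdelay_frags(struct fs *fs)
{
	/*
	 * frags = ms * (rev/sec) * (sect/rev) / ((sect/frag) * (ms/sec)),
	 * rounded up to a full block.  With fs_rotdelay = 4, fs_rps = 60,
	 * fs_nsect = 32 and NSPF(fs) = 2 this is
	 * roundup(4 * 60 * 32 / (2 * 1000), fs_frag) = roundup(3, fs_frag).
	 */
	return (roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
	    (NSPF(fs) * 1000), fs->fs_frag));
}
#endif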
573 ufs_daddr_t
574 ffs_blkpref(ip, lbn, indx, bap)
575 struct inode *ip;
576 ufs_daddr_t lbn;
577 int indx;
578 ufs_daddr_t *bap;
579 {
580 register struct fs *fs;
581 register int cg;
582 int avgbfree, startcg;
583 ufs_daddr_t nextblk;
584 #if REV_ENDIAN_FS
585 daddr_t prev=0;
586 struct vnode *vp=ITOV(ip);
587 struct mount *mp=vp->v_mount;
588 int rev_endian=(mp->mnt_flag & MNT_REVEND);
589 #endif /* REV_ENDIAN_FS */
590
591 fs = ip->i_fs;
592 #if REV_ENDIAN_FS
593 if (indx && bap) {
594 if (rev_endian) {
595 if (bap != &ip->i_db[0])
596 prev = NXSwapLong(bap[indx - 1]);
597 else
598 prev = bap[indx - 1];
599 } else prev = bap[indx - 1];
600 }
601 if (indx % fs->fs_maxbpg == 0 || prev == 0)
602 #else /* REV_ENDIAN_FS */
603 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0)
604 #endif /* REV_ENDIAN_FS */
605 {
606 if (lbn < NDADDR) {
607 cg = ino_to_cg(fs, ip->i_number);
608 return (fs->fs_fpg * cg + fs->fs_frag);
609 }
610 /*
611  * Find a cylinder group with a greater than average number of
612 * unused data blocks.
613 */
614 #if REV_ENDIAN_FS
615 if (indx == 0 || prev == 0)
616 #else /* REV_ENDIAN_FS */
617 if (indx == 0 || bap[indx - 1] == 0)
618 #endif /* REV_ENDIAN_FS */
619 startcg =
620 ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
621 else
622 #if REV_ENDIAN_FS
623 startcg = dtog(fs, prev) + 1;
624 #else /* REV_ENDIAN_FS */
625 startcg = dtog(fs, bap[indx - 1]) + 1;
626 #endif /* REV_ENDIAN_FS */
627 startcg %= fs->fs_ncg;
628 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
629 for (cg = startcg; cg < fs->fs_ncg; cg++)
630 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
631 fs->fs_cgrotor = cg;
632 return (fs->fs_fpg * cg + fs->fs_frag);
633 }
634 for (cg = 0; cg <= startcg; cg++)
635 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
636 fs->fs_cgrotor = cg;
637 return (fs->fs_fpg * cg + fs->fs_frag);
638 }
639 return (NULL);
640 }
641 /*
642 * One or more previous blocks have been laid out. If less
643 * than fs_maxcontig previous blocks are contiguous, the
644 * next block is requested contiguously, otherwise it is
645 * requested rotationally delayed by fs_rotdelay milliseconds.
646 */
647 #if REV_ENDIAN_FS
648 if (rev_endian) {
649 nextblk = prev + fs->fs_frag;
650 if (indx < fs->fs_maxcontig) {
651 return (nextblk);
652 }
653 if (bap != &ip->i_db[0])
654 prev = NXSwapLong(bap[indx - fs->fs_maxcontig]);
655 else
656 prev = bap[indx - fs->fs_maxcontig];
657 if (prev + blkstofrags(fs, fs->fs_maxcontig) != nextblk)
658 return (nextblk);
659 } else {
660 #endif /* REV_ENDIAN_FS */
661 nextblk = bap[indx - 1] + fs->fs_frag;
662 if (indx < fs->fs_maxcontig || bap[indx - fs->fs_maxcontig] +
663 blkstofrags(fs, fs->fs_maxcontig) != nextblk)
664 return (nextblk);
665 #if REV_ENDIAN_FS
666 }
667 #endif /* REV_ENDIAN_FS */
668 if (fs->fs_rotdelay != 0)
669 /*
670 * Here we convert ms of delay to frags as:
671 * (frags) = (ms) * (rev/sec) * (sect/rev) /
672 * ((sect/frag) * (ms/sec))
673 * then round up to the next block.
674 */
675 nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
676 (NSPF(fs) * 1000), fs->fs_frag);
677 return (nextblk);
678 }
679
680 /*
681 * Implement the cylinder overflow algorithm.
682 *
683 * The policy implemented by this algorithm is:
684 * 1) allocate the block in its requested cylinder group.
685  * 2) quadratically rehash on the cylinder group number.
686 * 3) brute force search for a free block.
687 */
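/*
 * Compiled-out sketch of the probe order implemented by ffs_hashalloc()
 * below, written against plain integers so the three phases are easy to
 * see: the preferred group, the quadratic rehash, and the linear sweep.
 * "try(cg)" stands in for the allocator callback and is not a real
 * function in this file.
 */
#if 0
static long
example_hashalloc_order(int icg, int ncg)
{
	long result;
	int i, cg = icg;

	if ((result = try(cg)) != 0)		/* 1: preferred group */
		return (result);
	for (i = 1; i < ncg; i *= 2) {		/* 2: icg+1, icg+3, icg+7, ... */
		cg += i;
		if (cg >= ncg)
			cg -= ncg;
		if ((result = try(cg)) != 0)
			return (result);
	}
	cg = (icg + 2) % ncg;			/* 3: sweep the remaining groups */
	for (i = 2; i < ncg; i++) {
		if ((result = try(cg)) != 0)
			return (result);
		if (++cg == ncg)
			cg = 0;
	}
	return (0);
}
#endif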
688 /*VARARGS5*/
689 static u_long
690 ffs_hashalloc(ip, cg, pref, size, allocator)
691 struct inode *ip;
692 int cg;
693 long pref;
694 int size; /* size for data blocks, mode for inodes */
695 u_int32_t (*allocator)();
696 {
697 register struct fs *fs;
698 long result;
699 int i, icg = cg;
700
701 fs = ip->i_fs;
702 /*
703 * 1: preferred cylinder group
704 */
705 result = (*allocator)(ip, cg, pref, size);
706 if (result)
707 return (result);
708 /*
709 * 2: quadratic rehash
710 */
711 for (i = 1; i < fs->fs_ncg; i *= 2) {
712 cg += i;
713 if (cg >= fs->fs_ncg)
714 cg -= fs->fs_ncg;
715 result = (*allocator)(ip, cg, 0, size);
716 if (result)
717 return (result);
718 }
719 /*
720 * 3: brute force search
721 * Note that we start at i == 2, since 0 was checked initially,
722 * and 1 is always checked in the quadratic rehash.
723 */
724 cg = (icg + 2) % fs->fs_ncg;
725 for (i = 2; i < fs->fs_ncg; i++) {
726 result = (*allocator)(ip, cg, 0, size);
727 if (result)
728 return (result);
729 cg++;
730 if (cg == fs->fs_ncg)
731 cg = 0;
732 }
733 return (NULL);
734 }
735
736 /*
737 * Determine whether a fragment can be extended.
738 *
739 * Check to see if the necessary fragments are available, and
740 * if they are, allocate them.
741 */
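/*
 * Compiled-out illustration of the boundary test applied below: a
 * fragment can only be extended within the block that contains it, so
 * the extension fails if the last new fragment would land in a different
 * block than bprev.  The sizes in the comment are an invented example.
 */
#if 0
static int
example_can_extend(struct fs *fs, long bprev, int nsize)
{
	int frags = numfrags(fs, nsize);	/* fragments after extension */
	int bbase = fragnum(fs, bprev);		/* bprev's offset in its block */

	/*
	 * With fs_frag = 8 and bprev at offset 6, growing to 3 fragments
	 * would place the last fragment in the next block (its fragnum
	 * wraps to 0, which is less than 6), so the extension is refused.
	 */
	return (bbase <= fragnum(fs, bprev + frags - 1));
}
#endif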
742 static ufs_daddr_t
743 ffs_fragextend(ip, cg, bprev, osize, nsize)
744 struct inode *ip;
745 int cg;
746 long bprev;
747 int osize, nsize;
748 {
749 register struct fs *fs;
750 register struct cg *cgp;
751 struct buf *bp;
752 struct timeval tv;
753 long bno;
754 int frags, bbase;
755 int i, error;
756 #if REV_ENDIAN_FS
757 struct vnode *vp=ITOV(ip);
758 struct mount *mp=vp->v_mount;
759 int rev_endian=(mp->mnt_flag & MNT_REVEND);
760 #endif /* REV_ENDIAN_FS */
761
762 fs = ip->i_fs;
763 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
764 return (NULL);
765 frags = numfrags(fs, nsize); /* number of fragments needed */
766 	bbase = fragnum(fs, bprev);	/* offset within the block (bprev mod fs_frag) */
767 if (bbase > fragnum(fs, (bprev + frags - 1))) {
768 /* cannot extend across a block boundary */
769 return (NULL);
770 }
771 /* read corresponding cylinder group info */
772 error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))),
773 (int)fs->fs_cgsize, NOCRED, &bp);
774 if (error) {
775 buf_brelse(bp);
776 return (NULL);
777 }
778 cgp = (struct cg *)buf_dataptr(bp);
779 #if REV_ENDIAN_FS
780 if (rev_endian) {
781 byte_swap_cgin(cgp, fs);
782 }
783 #endif /* REV_ENDIAN_FS */
784
785 if (!cg_chkmagic(cgp)) {
786 #if REV_ENDIAN_FS
787 if (rev_endian)
788 byte_swap_cgout(cgp,fs);
789 #endif /* REV_ENDIAN_FS */
790 buf_brelse(bp);
791 return (NULL);
792 }
793 microtime(&tv);
794 cgp->cg_time = tv.tv_sec;
795 bno = dtogd(fs, bprev);
796 for (i = numfrags(fs, osize); i < frags; i++)
797 if (isclr(cg_blksfree(cgp), bno + i)) {
798 #if REV_ENDIAN_FS
799 if (rev_endian)
800 byte_swap_cgout(cgp,fs);
801 #endif /* REV_ENDIAN_FS */
802 buf_brelse(bp);
803 return (NULL);
804 }
805 /*
806 * the current fragment can be extended
807  * deduct the count on the fragment being extended into
808 * increase the count on the remaining fragment (if any)
809 * allocate the extended piece
810 */
811 for (i = frags; i < fs->fs_frag - bbase; i++)
812 if (isclr(cg_blksfree(cgp), bno + i))
813 break;
814 cgp->cg_frsum[i - numfrags(fs, osize)]--;
815 if (i != frags)
816 cgp->cg_frsum[i - frags]++;
817 for (i = numfrags(fs, osize); i < frags; i++) {
818 clrbit(cg_blksfree(cgp), bno + i);
819 cgp->cg_cs.cs_nffree--;
820 fs->fs_cstotal.cs_nffree--;
821 fs->fs_cs(fs, cg).cs_nffree--;
822 }
823 fs->fs_fmod = 1;
824 #if REV_ENDIAN_FS
825 if (rev_endian)
826 byte_swap_cgout(cgp,fs);
827 #endif /* REV_ENDIAN_FS */
828 buf_bdwrite(bp);
829 return (bprev);
830 }
831
832 /*
833 * Determine whether a block can be allocated.
834 *
835 * Check to see if a block of the appropriate size is available,
836 * and if it is, allocate it.
837 */
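/*
 * Compiled-out illustration of the fragment bookkeeping performed below
 * when no free fragment run of the right size exists: carving a
 * 3-fragment request out of a full 8-fragment block leaves a run of 5
 * free fragments, recorded as one entry in cg_frsum[5].  The sizes are an
 * invented example and only the per-cg counters are shown.
 */
#if 0
static void
example_carve_block(struct cg *cgp, int frag_per_block, int frags)
{
	int leftover = frag_per_block - frags;	/* e.g. 8 - 3 = 5 */

	cgp->cg_cs.cs_nffree += leftover;	/* 5 new free fragments */
	cgp->cg_frsum[leftover]++;		/* one free run of length 5 */
}
#endif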
838 static ufs_daddr_t
839 ffs_alloccg(ip, cg, bpref, size)
840 struct inode *ip;
841 int cg;
842 ufs_daddr_t bpref;
843 int size;
844 {
845 register struct fs *fs;
846 register struct cg *cgp;
847 struct buf *bp;
848 struct timeval tv;
849 register int i;
850 int error, bno, frags, allocsiz;
851 #if REV_ENDIAN_FS
852 struct vnode *vp=ITOV(ip);
853 struct mount *mp=vp->v_mount;
854 int rev_endian=(mp->mnt_flag & MNT_REVEND);
855 #endif /* REV_ENDIAN_FS */
856
857 fs = ip->i_fs;
858 if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
859 return (NULL);
860 error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))),
861 (int)fs->fs_cgsize, NOCRED, &bp);
862 if (error) {
863 buf_brelse(bp);
864 return (NULL);
865 }
866 cgp = (struct cg *)buf_dataptr(bp);
867 #if REV_ENDIAN_FS
868 if (rev_endian)
869 byte_swap_cgin(cgp,fs);
870 #endif /* REV_ENDIAN_FS */
871 if (!cg_chkmagic(cgp) ||
872 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
873 #if REV_ENDIAN_FS
874 if (rev_endian)
875 byte_swap_cgout(cgp,fs);
876 #endif /* REV_ENDIAN_FS */
877 buf_brelse(bp);
878 return (NULL);
879 }
880 microtime(&tv);
881 cgp->cg_time = tv.tv_sec;
882 if (size == fs->fs_bsize) {
883 bno = ffs_alloccgblk(fs, cgp, bpref);
884 #if REV_ENDIAN_FS
885 if (rev_endian)
886 byte_swap_cgout(cgp,fs);
887 #endif /* REV_ENDIAN_FS */
888 buf_bdwrite(bp);
889 return (bno);
890 }
891 /*
892 * check to see if any fragments are already available
893 * allocsiz is the size which will be allocated, hacking
894 * it down to a smaller size if necessary
895 */
896 frags = numfrags(fs, size);
897 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
898 if (cgp->cg_frsum[allocsiz] != 0)
899 break;
900 if (allocsiz == fs->fs_frag) {
901 /*
902 * no fragments were available, so a block will be
903 * allocated, and hacked up
904 */
905 if (cgp->cg_cs.cs_nbfree == 0) {
906 #if REV_ENDIAN_FS
907 if (rev_endian)
908 byte_swap_cgout(cgp,fs);
909 #endif /* REV_ENDIAN_FS */
910 buf_brelse(bp);
911 return (NULL);
912 }
913 bno = ffs_alloccgblk(fs, cgp, bpref);
914 bpref = dtogd(fs, bno);
915 for (i = frags; i < fs->fs_frag; i++)
916 setbit(cg_blksfree(cgp), bpref + i);
917 i = fs->fs_frag - frags;
918 cgp->cg_cs.cs_nffree += i;
919 fs->fs_cstotal.cs_nffree += i;
920 fs->fs_cs(fs, cg).cs_nffree += i;
921 fs->fs_fmod = 1;
922 cgp->cg_frsum[i]++;
923 #if REV_ENDIAN_FS
924 if (rev_endian)
925 byte_swap_cgout(cgp,fs);
926 #endif /* REV_ENDIAN_FS */
927 buf_bdwrite(bp);
928 return (bno);
929 }
930 bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
931 if (bno < 0) {
932 #if REV_ENDIAN_FS
933 if (rev_endian)
934 byte_swap_cgout(cgp,fs);
935 #endif /* REV_ENDIAN_FS */
936 buf_brelse(bp);
937 return (NULL);
938 }
939 for (i = 0; i < frags; i++)
940 clrbit(cg_blksfree(cgp), bno + i);
941 cgp->cg_cs.cs_nffree -= frags;
942 fs->fs_cstotal.cs_nffree -= frags;
943 fs->fs_cs(fs, cg).cs_nffree -= frags;
944 fs->fs_fmod = 1;
945 cgp->cg_frsum[allocsiz]--;
946 if (frags != allocsiz)
947 cgp->cg_frsum[allocsiz - frags]++;
948 #if REV_ENDIAN_FS
949 if (rev_endian)
950 byte_swap_cgout(cgp,fs);
951 #endif /* REV_ENDIAN_FS */
952 buf_bdwrite(bp);
953 return (cg * fs->fs_fpg + bno);
954 }
955
956 /*
957 * Allocate a block in a cylinder group.
958 *
959 * This algorithm implements the following policy:
960 * 1) allocate the requested block.
961 * 2) allocate a rotationally optimal block in the same cylinder.
962 * 3) allocate the next available block on the block rotor for the
963 * specified cylinder group.
964 * Note that this routine only allocates fs_bsize blocks; these
965 * blocks may be fragmented by the routine that allocates them.
966 */
967 static ufs_daddr_t
968 ffs_alloccgblk(fs, cgp, bpref)
969 register struct fs *fs;
970 register struct cg *cgp;
971 ufs_daddr_t bpref;
972 {
973 ufs_daddr_t bno, blkno;
974 int cylno, pos, delta;
975 short *cylbp;
976 register int i;
977
978 if (bpref == 0 || dtog(fs, bpref) != cgp->cg_cgx) {
979 bpref = cgp->cg_rotor;
980 goto norot;
981 }
982 bpref = blknum(fs, bpref);
983 bpref = dtogd(fs, bpref);
984 /*
985 * if the requested block is available, use it
986 */
987 if (ffs_isblock(fs, cg_blksfree(cgp), fragstoblks(fs, bpref))) {
988 bno = bpref;
989 goto gotit;
990 }
991 if (fs->fs_nrpos <= 1 || fs->fs_cpc == 0) {
992 /*
993 * Block layout information is not available.
994 * Leaving bpref unchanged means we take the
995 * next available free block following the one
996 * we just allocated. Hopefully this will at
997 * least hit a track cache on drives of unknown
998 * geometry (e.g. SCSI).
999 */
1000 goto norot;
1001 }
1002 /*
1003 * check for a block available on the same cylinder
1004 */
1005 cylno = cbtocylno(fs, bpref);
1006 if (cg_blktot(cgp)[cylno] == 0)
1007 goto norot;
1008 /*
1009 * check the summary information to see if a block is
1010 * available in the requested cylinder starting at the
1011 * requested rotational position and proceeding around.
1012 */
1013 cylbp = cg_blks(fs, cgp, cylno);
1014 pos = cbtorpos(fs, bpref);
1015 for (i = pos; i < fs->fs_nrpos; i++)
1016 if (cylbp[i] > 0)
1017 break;
1018 if (i == fs->fs_nrpos)
1019 for (i = 0; i < pos; i++)
1020 if (cylbp[i] > 0)
1021 break;
1022 if (cylbp[i] > 0) {
1023 /*
1024 * found a rotational position, now find the actual
1025  * block; panic if none is actually there.
1026 */
1027 pos = cylno % fs->fs_cpc;
1028 bno = (cylno - pos) * fs->fs_spc / NSPB(fs);
1029 if (fs_postbl(fs, pos)[i] == -1) {
1030 printf("pos = %d, i = %d, fs = %s\n",
1031 pos, i, fs->fs_fsmnt);
1032 panic("ffs_alloccgblk: cyl groups corrupted");
1033 }
1034 for (i = fs_postbl(fs, pos)[i];; ) {
1035 if (ffs_isblock(fs, cg_blksfree(cgp), bno + i)) {
1036 bno = blkstofrags(fs, (bno + i));
1037 goto gotit;
1038 }
1039 delta = fs_rotbl(fs)[i];
1040 if (delta <= 0 ||
1041 delta + i > fragstoblks(fs, fs->fs_fpg))
1042 break;
1043 i += delta;
1044 }
1045 printf("pos = %d, i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
1046 panic("ffs_alloccgblk: can't find blk in cyl");
1047 }
1048 norot:
1049 /*
1050 * no blocks in the requested cylinder, so take next
1051 * available one in this cylinder group.
1052 */
1053 bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
1054 if (bno < 0)
1055 return (NULL);
1056 cgp->cg_rotor = bno;
1057 gotit:
1058 blkno = fragstoblks(fs, bno);
1059 ffs_clrblock(fs, cg_blksfree(cgp), (long)blkno);
1060 ffs_clusteracct(fs, cgp, blkno, -1);
1061 cgp->cg_cs.cs_nbfree--;
1062 fs->fs_cstotal.cs_nbfree--;
1063 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
1064 cylno = cbtocylno(fs, bno);
1065 cg_blks(fs, cgp, cylno)[cbtorpos(fs, bno)]--;
1066 cg_blktot(cgp)[cylno]--;
1067 fs->fs_fmod = 1;
1068 return (cgp->cg_cgx * fs->fs_fpg + bno);
1069 }
1070
1071 /*
1072 * Determine whether a cluster can be allocated.
1073 *
1074 * We do not currently check for optimal rotational layout if there
1075 * are multiple choices in the same cylinder group. Instead we just
1076 * take the first one that we find following bpref.
1077 */
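/*
 * Compiled-out sketch of the scan performed below: walk the per-cg
 * cluster free map one bit at a time, counting the current run of free
 * blocks and stopping as soon as the run reaches the requested length.
 * The "map" and "nclusterblks" parameters are placeholders for the cg
 * fields used by the real routine, which also honors bpref.
 */
#if 0
static int
example_find_run(u_char *map, int nclusterblks, int len)
{
	int got, run = 0;

	for (got = 0; got < nclusterblks; got++) {
		if (isset(map, got))
			run++;
		else
			run = 0;
		if (run == len)
			return (got - run + 1);	/* first block of the run */
	}
	return (-1);				/* no cluster big enough */
}
#endif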
1078 static ufs_daddr_t
1079 ffs_clusteralloc(ip, cg, bpref, len)
1080 struct inode *ip;
1081 int cg;
1082 ufs_daddr_t bpref;
1083 int len;
1084 {
1085 register struct fs *fs;
1086 register struct cg *cgp;
1087 struct buf *bp;
1088 int i, got, run, bno, bit, map;
1089 u_char *mapp;
1090 int32_t *lp;
1091 #if REV_ENDIAN_FS
1092 struct vnode *vp=ITOV(ip);
1093 struct mount *mp=vp->v_mount;
1094 int rev_endian=(mp->mnt_flag & MNT_REVEND);
1095 #endif /* REV_ENDIAN_FS */
1096
1097 fs = ip->i_fs;
1098 if (fs->fs_maxcluster[cg] < len)
1099 return (NULL);
1100 if (buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))), (int)fs->fs_cgsize,
1101 NOCRED, &bp))
1102 goto fail;
1103 cgp = (struct cg *)buf_dataptr(bp);
1104 #if REV_ENDIAN_FS
1105 if (rev_endian)
1106 byte_swap_cgin(cgp,fs);
1107 #endif /* REV_ENDIAN_FS */
1108 if (!cg_chkmagic(cgp)) {
1109 #if REV_ENDIAN_FS
1110 if (rev_endian)
1111 byte_swap_cgout(cgp,fs);
1112 #endif /* REV_ENDIAN_FS */
1113 goto fail;
1114 }
1115 /*
1116 * Check to see if a cluster of the needed size (or bigger) is
1117 * available in this cylinder group.
1118 */
1119 lp = &cg_clustersum(cgp)[len];
1120 for (i = len; i <= fs->fs_contigsumsize; i++)
1121 if (*lp++ > 0)
1122 break;
1123 if (i > fs->fs_contigsumsize) {
1124 /*
1125 * This is the first time looking for a cluster in this
1126 * cylinder group. Update the cluster summary information
1127 * to reflect the true maximum sized cluster so that
1128 * future cluster allocation requests can avoid reading
1129 * the cylinder group map only to find no clusters.
1130 */
1131 lp = &cg_clustersum(cgp)[len - 1];
1132 for (i = len - 1; i > 0; i--)
1133 if (*lp-- > 0)
1134 break;
1135 fs->fs_maxcluster[cg] = i;
1136 #if REV_ENDIAN_FS
1137 if (rev_endian)
1138 byte_swap_cgout(cgp,fs);
1139 #endif /* REV_ENDIAN_FS */
1140 goto fail;
1141 }
1142 /*
1143 * Search the cluster map to find a big enough cluster.
1144 * We take the first one that we find, even if it is larger
1145 * than we need as we prefer to get one close to the previous
1146 * block allocation. We do not search before the current
1147 * preference point as we do not want to allocate a block
1148 * that is allocated before the previous one (as we will
1149 * then have to wait for another pass of the elevator
1150 * algorithm before it will be read). We prefer to fail and
1151 * be recalled to try an allocation in the next cylinder group.
1152 */
1153 if (dtog(fs, bpref) != cg)
1154 bpref = 0;
1155 else
1156 bpref = fragstoblks(fs, dtogd(fs, blknum(fs, bpref)));
1157 mapp = &cg_clustersfree(cgp)[bpref / NBBY];
1158 map = *mapp++;
1159 bit = 1 << (bpref % NBBY);
1160 for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) {
1161 if ((map & bit) == 0) {
1162 run = 0;
1163 } else {
1164 run++;
1165 if (run == len)
1166 break;
1167 }
1168 if ((got & (NBBY - 1)) != (NBBY - 1)) {
1169 bit <<= 1;
1170 } else {
1171 map = *mapp++;
1172 bit = 1;
1173 }
1174 }
1175 if (got == cgp->cg_nclusterblks) {
1176 #if REV_ENDIAN_FS
1177 if (rev_endian)
1178 byte_swap_cgout(cgp,fs);
1179 #endif /* REV_ENDIAN_FS */
1180 goto fail;
1181 }
1182 /*
1183 * Allocate the cluster that we have found.
1184 */
1185 for (i = 1; i <= len; i++)
1186 if (!ffs_isblock(fs, cg_blksfree(cgp), got - run + i))
1187 panic("ffs_clusteralloc: map mismatch");
1188 bno = cg * fs->fs_fpg + blkstofrags(fs, got - run + 1);
1189 if (dtog(fs, bno) != cg)
1190 panic("ffs_clusteralloc: allocated out of group");
1191 len = blkstofrags(fs, len);
1192 for (i = 0; i < len; i += fs->fs_frag)
1193 if ((got = ffs_alloccgblk(fs, cgp, bno + i)) != bno + i)
1194 panic("ffs_clusteralloc: lost block");
1195 #if REV_ENDIAN_FS
1196 if (rev_endian)
1197 byte_swap_cgout(cgp,fs);
1198 #endif /* REV_ENDIAN_FS */
1199 buf_bdwrite(bp);
1200 return (bno);
1201
1202 fail:
1203 buf_brelse(bp);
1204 return (0);
1205 }
1206
1207 /*
1208 * Determine whether an inode can be allocated.
1209 *
1210 * Check to see if an inode is available, and if it is,
1211 * allocate it using the following policy:
1212 * 1) allocate the requested inode.
1213 * 2) allocate the next available inode after the requested
1214 * inode in the specified cylinder group.
1215 */
1216 static ino_t
1217 ffs_nodealloccg(ip, cg, ipref, mode)
1218 struct inode *ip;
1219 int cg;
1220 ufs_daddr_t ipref;
1221 int mode;
1222 {
1223 register struct fs *fs;
1224 register struct cg *cgp;
1225 struct buf *bp;
1226 struct timeval tv;
1227 int error, start, len, loc, map, i;
1228 #if REV_ENDIAN_FS
1229 struct vnode *vp=ITOV(ip);
1230 struct mount *mp=vp->v_mount;
1231 int rev_endian=(mp->mnt_flag & MNT_REVEND);
1232 #endif /* REV_ENDIAN_FS */
1233
1234 fs = ip->i_fs;
1235 if (fs->fs_cs(fs, cg).cs_nifree == 0)
1236 return (NULL);
1237 error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))),
1238 (int)fs->fs_cgsize, NOCRED, &bp);
1239 if (error) {
1240 buf_brelse(bp);
1241 return (NULL);
1242 }
1243 cgp = (struct cg *)buf_dataptr(bp);
1244 #if REV_ENDIAN_FS
1245 if (rev_endian)
1246 byte_swap_cgin(cgp,fs);
1247 #endif /* REV_ENDIAN_FS */
1248 if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) {
1249 #if REV_ENDIAN_FS
1250 if (rev_endian)
1251 byte_swap_cgout(cgp,fs);
1252 #endif /* REV_ENDIAN_FS */
1253 buf_brelse(bp);
1254 return (NULL);
1255 }
1256
1257 microtime(&tv);
1258 cgp->cg_time = tv.tv_sec;
1259 if (ipref) {
1260 ipref %= fs->fs_ipg;
1261 if (isclr(cg_inosused(cgp), ipref))
1262 goto gotit;
1263 }
1264 start = cgp->cg_irotor / NBBY;
1265 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
1266 loc = skpc(0xff, len, &cg_inosused(cgp)[start]);
1267 if (loc == 0) {
1268 len = start + 1;
1269 start = 0;
1270 loc = skpc(0xff, len, &cg_inosused(cgp)[0]);
1271 if (loc == 0) {
1272 printf("cg = %d, irotor = %d, fs = %s\n",
1273 cg, cgp->cg_irotor, fs->fs_fsmnt);
1274 panic("ffs_nodealloccg: map corrupted");
1275 /* NOTREACHED */
1276 }
1277 }
1278 i = start + len - loc;
1279 map = cg_inosused(cgp)[i];
1280 ipref = i * NBBY;
1281 for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
1282 if ((map & i) == 0) {
1283 cgp->cg_irotor = ipref;
1284 goto gotit;
1285 }
1286 }
1287 printf("fs = %s\n", fs->fs_fsmnt);
1288 panic("ffs_nodealloccg: block not in map");
1289 /* NOTREACHED */
1290 gotit:
1291 setbit(cg_inosused(cgp), ipref);
1292 cgp->cg_cs.cs_nifree--;
1293 fs->fs_cstotal.cs_nifree--;
1294 fs->fs_cs(fs, cg).cs_nifree--;
1295 fs->fs_fmod = 1;
1296 if ((mode & IFMT) == IFDIR) {
1297 cgp->cg_cs.cs_ndir++;
1298 fs->fs_cstotal.cs_ndir++;
1299 fs->fs_cs(fs, cg).cs_ndir++;
1300 }
1301 #if REV_ENDIAN_FS
1302 if (rev_endian)
1303 byte_swap_cgout(cgp,fs);
1304 #endif /* REV_ENDIAN_FS */
1305 buf_bdwrite(bp);
1306 return (cg * fs->fs_ipg + ipref);
1307 }
1308
1309 /*
1310 * Free a block or fragment.
1311 *
1312 * The specified block or fragment is placed back in the
1313 * free map. If a fragment is deallocated, a possible
1314 * block reassembly is checked.
1315 */
1316 void
1317 ffs_blkfree(ip, bno, size)
1318 register struct inode *ip;
1319 ufs_daddr_t bno;
1320 long size;
1321 {
1322 register struct fs *fs;
1323 register struct cg *cgp;
1324 struct buf *bp;
1325 struct timeval tv;
1326 ufs_daddr_t blkno;
1327 int i, error, cg, blk, frags, bbase;
1328 #if REV_ENDIAN_FS
1329 struct vnode *vp=ITOV(ip);
1330 struct mount *mp=vp->v_mount;
1331 int rev_endian=(mp->mnt_flag & MNT_REVEND);
1332 #endif /* REV_ENDIAN_FS */
1333
1334 fs = ip->i_fs;
1335 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
1336 printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n",
1337 ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
1338 panic("blkfree: bad size");
1339 }
1340 cg = dtog(fs, bno);
1341 if ((u_int)bno >= fs->fs_size) {
1342 printf("bad block %d, ino %d\n", bno, ip->i_number);
1343 ffs_fserr(fs, ip->i_uid, "bad block");
1344 return;
1345 }
1346 error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))),
1347 (int)fs->fs_cgsize, NOCRED, &bp);
1348 if (error) {
1349 buf_brelse(bp);
1350 return;
1351 }
1352 cgp = (struct cg *)buf_dataptr(bp);
1353 #if REV_ENDIAN_FS
1354 if (rev_endian)
1355 byte_swap_cgin(cgp,fs);
1356 #endif /* REV_ENDIAN_FS */
1357 if (!cg_chkmagic(cgp)) {
1358 #if REV_ENDIAN_FS
1359 if (rev_endian)
1360 byte_swap_cgout(cgp,fs);
1361 #endif /* REV_ENDIAN_FS */
1362 buf_brelse(bp);
1363 return;
1364 }
1365 microtime(&tv);
1366 cgp->cg_time = tv.tv_sec;
1367 bno = dtogd(fs, bno);
1368 if (size == fs->fs_bsize) {
1369 blkno = fragstoblks(fs, bno);
1370 if (ffs_isblock(fs, cg_blksfree(cgp), blkno)) {
1371 printf("dev = 0x%x, block = %d, fs = %s\n",
1372 ip->i_dev, bno, fs->fs_fsmnt);
1373 panic("blkfree: freeing free block");
1374 }
1375 ffs_setblock(fs, cg_blksfree(cgp), blkno);
1376 ffs_clusteracct(fs, cgp, blkno, 1);
1377 cgp->cg_cs.cs_nbfree++;
1378 fs->fs_cstotal.cs_nbfree++;
1379 fs->fs_cs(fs, cg).cs_nbfree++;
1380 i = cbtocylno(fs, bno);
1381 cg_blks(fs, cgp, i)[cbtorpos(fs, bno)]++;
1382 cg_blktot(cgp)[i]++;
1383 } else {
1384 bbase = bno - fragnum(fs, bno);
1385 /*
1386 * decrement the counts associated with the old frags
1387 */
1388 blk = blkmap(fs, cg_blksfree(cgp), bbase);
1389 ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
1390 /*
1391 * deallocate the fragment
1392 */
1393 frags = numfrags(fs, size);
1394 for (i = 0; i < frags; i++) {
1395 if (isset(cg_blksfree(cgp), bno + i)) {
1396 printf("dev = 0x%x, block = %d, fs = %s\n",
1397 ip->i_dev, bno + i, fs->fs_fsmnt);
1398 panic("blkfree: freeing free frag");
1399 }
1400 setbit(cg_blksfree(cgp), bno + i);
1401 }
1402 cgp->cg_cs.cs_nffree += i;
1403 fs->fs_cstotal.cs_nffree += i;
1404 fs->fs_cs(fs, cg).cs_nffree += i;
1405 /*
1406 * add back in counts associated with the new frags
1407 */
1408 blk = blkmap(fs, cg_blksfree(cgp), bbase);
1409 ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
1410 /*
1411 * if a complete block has been reassembled, account for it
1412 */
1413 blkno = fragstoblks(fs, bbase);
1414 if (ffs_isblock(fs, cg_blksfree(cgp), blkno)) {
1415 cgp->cg_cs.cs_nffree -= fs->fs_frag;
1416 fs->fs_cstotal.cs_nffree -= fs->fs_frag;
1417 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
1418 ffs_clusteracct(fs, cgp, blkno, 1);
1419 cgp->cg_cs.cs_nbfree++;
1420 fs->fs_cstotal.cs_nbfree++;
1421 fs->fs_cs(fs, cg).cs_nbfree++;
1422 i = cbtocylno(fs, bbase);
1423 cg_blks(fs, cgp, i)[cbtorpos(fs, bbase)]++;
1424 cg_blktot(cgp)[i]++;
1425 }
1426 }
1427 fs->fs_fmod = 1;
1428 #if REV_ENDIAN_FS
1429 if (rev_endian)
1430 byte_swap_cgout(cgp,fs);
1431 #endif /* REV_ENDIAN_FS */
1432 buf_bdwrite(bp);
1433 }
1434
1435 #if DIAGNOSTIC
1436 /*
1437 * Verify allocation of a block or fragment. Returns true if block or
1438 * fragment is allocated, false if it is free.
1439 */
1440 ffs_checkblk(ip, bno, size)
1441 struct inode *ip;
1442 ufs_daddr_t bno;
1443 long size;
1444 {
1445 struct fs *fs;
1446 struct cg *cgp;
1447 struct buf *bp;
1448 int i, error, frags, free;
1449 #if REV_ENDIAN_FS
1450 struct vnode *vp=ITOV(ip);
1451 struct mount *mp=vp->v_mount;
1452 int rev_endian=(mp->mnt_flag & MNT_REVEND);
1453 #endif /* REV_ENDIAN_FS */
1454
1455 fs = ip->i_fs;
1456 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
1457 printf("bsize = %d, size = %d, fs = %s\n",
1458 fs->fs_bsize, size, fs->fs_fsmnt);
1459 panic("checkblk: bad size");
1460 }
1461 if ((u_int)bno >= fs->fs_size)
1462 panic("checkblk: bad block %d", bno);
1463 error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, dtog(fs, bno)))),
1464 (int)fs->fs_cgsize, NOCRED, &bp);
1465 if (error) {
1466 buf_brelse(bp);
1467 return;
1468 }
1469 cgp = (struct cg *)buf_dataptr(bp);
1470 #if REV_ENDIAN_FS
1471 if (rev_endian)
1472 byte_swap_cgin(cgp,fs);
1473 #endif /* REV_ENDIAN_FS */
1474 if (!cg_chkmagic(cgp)) {
1475 #if REV_ENDIAN_FS
1476 if (rev_endian)
1477 byte_swap_cgout(cgp,fs);
1478 #endif /* REV_ENDIAN_FS */
1479 buf_brelse(bp);
1480 return;
1481 }
1482 bno = dtogd(fs, bno);
1483 if (size == fs->fs_bsize) {
1484 free = ffs_isblock(fs, cg_blksfree(cgp), fragstoblks(fs, bno));
1485 } else {
1486 frags = numfrags(fs, size);
1487 for (free = 0, i = 0; i < frags; i++)
1488 if (isset(cg_blksfree(cgp), bno + i))
1489 free++;
1490 if (free != 0 && free != frags)
1491 panic("checkblk: partially free fragment");
1492 }
1493 #if REV_ENDIAN_FS
1494 if (rev_endian)
1495 byte_swap_cgout(cgp,fs);
1496 #endif /* REV_ENDIAN_FS */
1497 buf_brelse(bp);
1498 return (!free);
1499 }
1500 #endif /* DIAGNOSTIC */
1501
1502 /*
1503 * Free an inode.
1504 *
1505 * The specified inode is placed back in the free map.
1506 */
1507 int
1508 ffs_vfree(struct vnode *vp, ino_t ino, int mode)
1509 {
1510 register struct fs *fs;
1511 register struct cg *cgp;
1512 register struct inode *pip;
1513 struct buf *bp;
1514 struct timeval tv;
1515 int error, cg;
1516 #if REV_ENDIAN_FS
1517 struct mount *mp=vp->v_mount;
1518 int rev_endian=(mp->mnt_flag & MNT_REVEND);
1519 #endif /* REV_ENDIAN_FS */
1520
1521 pip = VTOI(vp);
1522 fs = pip->i_fs;
1523 if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
1524 panic("ifree: range: dev = 0x%x, ino = %d, fs = %s\n",
1525 pip->i_dev, ino, fs->fs_fsmnt);
1526 cg = ino_to_cg(fs, ino);
1527 error = (int)buf_bread(pip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))),
1528 (int)fs->fs_cgsize, NOCRED, &bp);
1529 if (error) {
1530 buf_brelse(bp);
1531 return (0);
1532 }
1533 cgp = (struct cg *)buf_dataptr(bp);
1534 #if REV_ENDIAN_FS
1535 if (rev_endian)
1536 byte_swap_cgin(cgp,fs);
1537 #endif /* REV_ENDIAN_FS */
1538 if (!cg_chkmagic(cgp)) {
1539 #if REV_ENDIAN_FS
1540 if (rev_endian)
1541 byte_swap_cgout(cgp,fs);
1542 #endif /* REV_ENDIAN_FS */
1543 buf_brelse(bp);
1544 return (0);
1545 }
1546 microtime(&tv);
1547 cgp->cg_time = tv.tv_sec;
1548 ino %= fs->fs_ipg;
1549 if (isclr(cg_inosused(cgp), ino)) {
1550 printf("dev = 0x%x, ino = %d, fs = %s\n",
1551 pip->i_dev, ino, fs->fs_fsmnt);
1552 if (fs->fs_ronly == 0)
1553 panic("ifree: freeing free inode");
1554 }
1555 clrbit(cg_inosused(cgp), ino);
1556 if (ino < cgp->cg_irotor)
1557 cgp->cg_irotor = ino;
1558 cgp->cg_cs.cs_nifree++;
1559 fs->fs_cstotal.cs_nifree++;
1560 fs->fs_cs(fs, cg).cs_nifree++;
1561 if ((mode & IFMT) == IFDIR) {
1562 cgp->cg_cs.cs_ndir--;
1563 fs->fs_cstotal.cs_ndir--;
1564 fs->fs_cs(fs, cg).cs_ndir--;
1565 }
1566 fs->fs_fmod = 1;
1567 #if REV_ENDIAN_FS
1568 if (rev_endian)
1569 byte_swap_cgout(cgp,fs);
1570 #endif /* REV_ENDIAN_FS */
1571 buf_bdwrite(bp);
1572 return (0);
1573 }
1574
1575 /*
1576 * Find a block of the specified size in the specified cylinder group.
1577 *
1578  * It is a panic if a request is made to find a block when none are
1579 * available.
1580 */
1581 static ufs_daddr_t
1582 ffs_mapsearch(fs, cgp, bpref, allocsiz)
1583 register struct fs *fs;
1584 register struct cg *cgp;
1585 ufs_daddr_t bpref;
1586 int allocsiz;
1587 {
1588 ufs_daddr_t bno;
1589 int start, len, loc, i;
1590 int blk, field, subfield, pos;
1591
1592 /*
1593 * find the fragment by searching through the free block
1594 * map for an appropriate bit pattern
1595 */
1596 if (bpref)
1597 start = dtogd(fs, bpref) / NBBY;
1598 else
1599 start = cgp->cg_frotor / NBBY;
1600 len = howmany(fs->fs_fpg, NBBY) - start;
1601 loc = scanc((u_int)len, (u_char *)&cg_blksfree(cgp)[start],
1602 (u_char *)fragtbl[fs->fs_frag],
1603 (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
1604 if (loc == 0) {
1605 len = start + 1;
1606 start = 0;
1607 loc = scanc((u_int)len, (u_char *)&cg_blksfree(cgp)[0],
1608 (u_char *)fragtbl[fs->fs_frag],
1609 (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
1610 if (loc == 0) {
1611 printf("start = %d, len = %d, fs = %s\n",
1612 start, len, fs->fs_fsmnt);
1613 panic("ffs_alloccg: map corrupted");
1614 /* NOTREACHED */
1615 }
1616 }
1617 bno = (start + len - loc) * NBBY;
1618 cgp->cg_frotor = bno;
1619 /*
1620 * found the byte in the map
1621 * sift through the bits to find the selected frag
1622 */
1623 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
1624 blk = blkmap(fs, cg_blksfree(cgp), bno);
1625 blk <<= 1;
1626 field = around[allocsiz];
1627 subfield = inside[allocsiz];
1628 for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
1629 if ((blk & field) == subfield)
1630 return (bno + pos);
1631 field <<= 1;
1632 subfield <<= 1;
1633 }
1634 }
1635 printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt);
1636 panic("ffs_alloccg: block not in map");
1637 return (-1);
1638 }
1639
1640 /*
1641 * Update the cluster map because of an allocation or free.
1642 *
1643 * Cnt == 1 means free; cnt == -1 means allocating.
1644 */
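/*
 * Compiled-out example of the summary bookkeeping performed below.  The
 * run lengths (one free block behind, two ahead) are invented purely to
 * show how the cluster summary counters move when a single block is
 * freed or allocated.
 */
#if 0
static void
example_clusteracct(int32_t *sump, int cnt)
{
	int back = 1;	/* length of the free run just before blkno */
	int forw = 2;	/* length of the free run just after blkno */

	/*
	 * Freeing blkno (cnt == 1) merges the two runs into one cluster of
	 * back + forw + 1 = 4 blocks, so a 4-cluster appears and the old
	 * 1- and 2-clusters disappear.  Allocating (cnt == -1) reverses
	 * every adjustment.
	 */
	sump[back + forw + 1] += cnt;
	if (back > 0)
		sump[back] -= cnt;
	if (forw > 0)
		sump[forw] -= cnt;
}
#endif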
1645 static void
1646 ffs_clusteracct(struct fs *fs, struct cg *cgp, ufs_daddr_t blkno, int cnt)
1647 {
1648 int32_t *sump;
1649 int32_t *lp;
1650 u_char *freemapp, *mapp;
1651 int i, start, end, forw, back, map, bit;
1652
1653 if (fs->fs_contigsumsize <= 0)
1654 return;
1655 freemapp = cg_clustersfree(cgp);
1656 sump = cg_clustersum(cgp);
1657 /*
1658 * Allocate or clear the actual block.
1659 */
1660 if (cnt > 0)
1661 setbit(freemapp, blkno);
1662 else
1663 clrbit(freemapp, blkno);
1664 /*
1665 * Find the size of the cluster going forward.
1666 */
1667 start = blkno + 1;
1668 end = start + fs->fs_contigsumsize;
1669 if (end >= cgp->cg_nclusterblks)
1670 end = cgp->cg_nclusterblks;
1671 mapp = &freemapp[start / NBBY];
1672 map = *mapp++;
1673 bit = 1 << (start % NBBY);
1674 for (i = start; i < end; i++) {
1675 if ((map & bit) == 0)
1676 break;
1677 if ((i & (NBBY - 1)) != (NBBY - 1)) {
1678 bit <<= 1;
1679 } else {
1680 map = *mapp++;
1681 bit = 1;
1682 }
1683 }
1684 forw = i - start;
1685 /*
1686 * Find the size of the cluster going backward.
1687 */
1688 start = blkno - 1;
1689 end = start - fs->fs_contigsumsize;
1690 if (end < 0)
1691 end = -1;
1692 mapp = &freemapp[start / NBBY];
1693 map = *mapp--;
1694 bit = 1 << (start % NBBY);
1695 for (i = start; i > end; i--) {
1696 if ((map & bit) == 0)
1697 break;
1698 if ((i & (NBBY - 1)) != 0) {
1699 bit >>= 1;
1700 } else {
1701 map = *mapp--;
1702 bit = 1 << (NBBY - 1);
1703 }
1704 }
1705 back = start - i;
1706 /*
1707 * Account for old cluster and the possibly new forward and
1708 * back clusters.
1709 */
1710 i = back + forw + 1;
1711 if (i > fs->fs_contigsumsize)
1712 i = fs->fs_contigsumsize;
1713 sump[i] += cnt;
1714 if (back > 0)
1715 sump[back] -= cnt;
1716 if (forw > 0)
1717 sump[forw] -= cnt;
1718 /*
1719 * Update cluster summary information.
1720 */
1721 lp = &sump[fs->fs_contigsumsize];
1722 for (i = fs->fs_contigsumsize; i > 0; i--)
1723 if (*lp-- > 0)
1724 break;
1725 fs->fs_maxcluster[cgp->cg_cgx] = i;
1726 }
1727
1728 /*
1729 * Fserr prints the name of a file system with an error diagnostic.
1730 *
1731 * The form of the error message is:
1732 * fs: error message
1733 */
1734 static void
1735 ffs_fserr(fs, uid, cp)
1736 struct fs *fs;
1737 u_int uid;
1738 char *cp;
1739 {
1740
1741 log(LOG_ERR, "uid %d on %s: %s\n", uid, fs->fs_fsmnt, cp);
1742 }