/[base]/head/sys/kern/vfs_subr.c
ViewVC logotype

Contents of /head/sys/kern/vfs_subr.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 234400 - (show annotations) (download)
Tue Apr 17 21:46:59 2012 UTC (12 years, 2 months ago) by mckusick
File MIME type: text/plain
File size: 116669 byte(s)
Drop export of vdestroy() function from kern/vfs_subr.c as it is
used only as a helper function in that file. Replace sole call to
vbusy() with inline code in vholdl(). Replace sole calls to vfree()
and vdestroy() with inline code in vdropl().

The Clang compiler already inlines these functions, so they do not
show up in a kernel backtrace which is confusing. Also you cannot
set their frame in kgdb which means that it is impossible to view
their local variables. So, while the produced code is unchanged,
the debugging should be easier.

Discussed with: kib
MFC after:      2 weeks

1 /*-
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
35 */
36
37 /*
38 * External virtual filesystem routines
39 */
40
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43
44 #include "opt_ddb.h"
45 #include "opt_watchdog.h"
46
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/bio.h>
50 #include <sys/buf.h>
51 #include <sys/condvar.h>
52 #include <sys/conf.h>
53 #include <sys/dirent.h>
54 #include <sys/event.h>
55 #include <sys/eventhandler.h>
56 #include <sys/extattr.h>
57 #include <sys/file.h>
58 #include <sys/fcntl.h>
59 #include <sys/jail.h>
60 #include <sys/kdb.h>
61 #include <sys/kernel.h>
62 #include <sys/kthread.h>
63 #include <sys/lockf.h>
64 #include <sys/malloc.h>
65 #include <sys/mount.h>
66 #include <sys/namei.h>
67 #include <sys/priv.h>
68 #include <sys/reboot.h>
69 #include <sys/sched.h>
70 #include <sys/sleepqueue.h>
71 #include <sys/stat.h>
72 #include <sys/sysctl.h>
73 #include <sys/syslog.h>
74 #include <sys/vmmeter.h>
75 #include <sys/vnode.h>
76 #ifdef SW_WATCHDOG
77 #include <sys/watchdog.h>
78 #endif
79
80 #include <machine/stdarg.h>
81
82 #include <security/mac/mac_framework.h>
83
84 #include <vm/vm.h>
85 #include <vm/vm_object.h>
86 #include <vm/vm_extern.h>
87 #include <vm/pmap.h>
88 #include <vm/vm_map.h>
89 #include <vm/vm_page.h>
90 #include <vm/vm_kern.h>
91 #include <vm/uma.h>
92
93 #ifdef DDB
94 #include <ddb/ddb.h>
95 #endif
96
/*
 * Indices into the two-element syncer_workitem_pending[] array; each
 * index selects one of the two syncer work queues.
 */
#define	WI_MPSAFEQ	0
#define	WI_GIANTQ	1

/*
 * Forward declarations of helpers that are private to this file.
 */
static void	delmntque(struct vnode *vp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp);
static void	v_incr_usecount(struct vnode *vp);
static void	v_decr_usecount(struct vnode *vp);
static void	v_decr_useonly(struct vnode *vp);
static void	v_upgrade_usecount(struct vnode *vp);
static void	vnlru_free(int count);
static void	vgonel(struct vnode *vp);
static void	vfs_knllock(void *arg);
static void	vfs_knlunlock(void *arg);
static void	vfs_knl_assert_locked(void *arg);
static void	vfs_knl_assert_unlocked(void *arg);
static void	destroy_vpollinfo(struct vpollinfo *vi);
116
117 /*
118 * Number of vnodes in existence. Increased whenever getnewvnode()
119 * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
120 */
121 static unsigned long numvnodes;
122
123 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
124 "Number of vnodes in existence");
125
126 /*
127 * Conversion tables for conversion from vnode types to inode formats
128 * and back.
129 */
130 enum vtype iftovt_tab[16] = {
131 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
132 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
133 };
134 int vttoif_tab[10] = {
135 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
136 S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
137 };
138
139 /*
140 * List of vnodes that are ready for recycling.
141 */
142 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
143
144 /*
145 * Free vnode target. Free vnodes may simply be files which have been stat'd
146 * but not read. This is somewhat common, and a small cache of such files
147 * should be kept to avoid recreation costs.
148 */
149 static u_long wantfreevnodes;
150 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
151 /* Number of vnodes in the free list. */
152 static u_long freevnodes;
153 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
154 "Number of vnodes in the free list");
155
156 static int vlru_allow_cache_src;
157 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
158 &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
159
160 /*
161 * Various variables used for debugging the new implementation of
162 * reassignbuf().
163 * XXX these are probably of (very) limited utility now.
164 */
165 static int reassignbufcalls;
166 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
167 "Number of calls to reassignbuf");
168
169 /*
170 * Cache for the mount type id assigned to NFS. This is used for
171 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
172 */
173 int nfs_mount_type = -1;
174
175 /* To keep more than one thread at a time from running vfs_getnewfsid */
176 static struct mtx mntid_mtx;
177
178 /*
179 * Lock for any access to the following:
180 * vnode_free_list
181 * numvnodes
182 * freevnodes
183 */
184 static struct mtx vnode_free_list_mtx;
185
186 /* Publicly exported FS */
187 struct nfs_public nfs_pub;
188
189 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
190 static uma_zone_t vnode_zone;
191 static uma_zone_t vnodepoll_zone;
192
193 /*
194 * The workitem queue.
195 *
196 * It is useful to delay writes of file data and filesystem metadata
197 * for tens of seconds so that quickly created and deleted files need
198 * not waste disk bandwidth being created and removed. To realize this,
199 * we append vnodes to a "workitem" queue. When running with a soft
200 * updates implementation, most pending metadata dependencies should
201 * not wait for more than a few seconds. Thus, mounted on block devices
202 * are delayed only about a half the time that file data is delayed.
203 * Similarly, directory updates are more critical, so are only delayed
204 * about a third the time that file data is delayed. Thus, there are
205 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
206 * one each second (driven off the filesystem syncer process). The
207 * syncer_delayno variable indicates the next queue that is to be processed.
208 * Items that need to be processed soon are placed in this queue:
209 *
210 * syncer_workitem_pending[syncer_delayno]
211 *
212 * A delay of fifteen seconds is done by placing the request fifteen
213 * entries later in the queue:
214 *
215 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
216 *
217 */
218 static int syncer_delayno;
219 static long syncer_mask;
220 LIST_HEAD(synclist, bufobj);
221 static struct synclist *syncer_workitem_pending[2];
222 /*
223 * The sync_mtx protects:
224 * bo->bo_synclist
225 * sync_vnode_count
226 * syncer_delayno
227 * syncer_state
228 * syncer_workitem_pending
229 * syncer_worklist_len
230 * rushjob
231 */
232 static struct mtx sync_mtx;
233 static struct cv sync_wakeup;
234
235 #define SYNCER_MAXDELAY 32
236 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
237 static int syncdelay = 30; /* max time to delay syncing data */
238 static int filedelay = 30; /* time to delay syncing files */
239 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
240 "Time to delay syncing files (in seconds)");
241 static int dirdelay = 29; /* time to delay syncing directories */
242 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
243 "Time to delay syncing directories (in seconds)");
244 static int metadelay = 28; /* time to delay syncing metadata */
245 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
246 "Time to delay syncing metadata (in seconds)");
247 static int rushjob; /* number of slots to run ASAP */
248 static int stat_rush_requests; /* number of times I/O speeded up */
249 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
250 "Number of times I/O speeded up (rush requests)");
251
252 /*
253 * When shutting down the syncer, run it at four times normal speed.
254 */
255 #define SYNCER_SHUTDOWN_SPEEDUP 4
256 static int sync_vnode_count;
257 static int syncer_worklist_len;
258 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
259 syncer_state;
260
261 /*
262 * Number of vnodes we want to exist at any one time. This is mostly used
263 * to size hash tables in vnode-related code. It is normally not used in
264 * getnewvnode(), as wantfreevnodes is normally nonzero.)
265 *
266 * XXX desiredvnodes is historical cruft and should not exist.
267 */
268 int desiredvnodes;
269 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
270 &desiredvnodes, 0, "Maximum number of vnodes");
271 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
272 &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
273 static int vnlru_nowhere;
274 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
275 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
276
277 /*
278 * Macros to control when a vnode is freed and recycled. All require
279 * the vnode interlock.
280 */
281 #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
282 #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
283 #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
284
285
286 /*
287 * Initialize the vnode management data structures.
288 *
289 * Reevaluate the following cap on the number of vnodes after the physical
290 * memory size exceeds 512GB. In the limit, as the physical memory size
291 * grows, the ratio of physical pages to vnodes approaches sixteen to one.
292 */
293 #ifndef MAXVNODES_MAX
294 #define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
295 #endif
296 static void
297 vntblinit(void *dummy __unused)
298 {
299 int physvnodes, virtvnodes;
300
301 /*
302 * Desiredvnodes is a function of the physical memory size and the
303 * kernel's heap size. Generally speaking, it scales with the
304 * physical memory size. The ratio of desiredvnodes to physical pages
305 * is one to four until desiredvnodes exceeds 98,304. Thereafter, the
306 * marginal ratio of desiredvnodes to physical pages is one to
307 * sixteen. However, desiredvnodes is limited by the kernel's heap
308 * size. The memory required by desiredvnodes vnodes and vm objects
309 * may not exceed one seventh of the kernel's heap size.
310 */
311 physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
312 cnt.v_page_count) / 16;
313 virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
314 sizeof(struct vnode)));
315 desiredvnodes = min(physvnodes, virtvnodes);
316 if (desiredvnodes > MAXVNODES_MAX) {
317 if (bootverbose)
318 printf("Reducing kern.maxvnodes %d -> %d\n",
319 desiredvnodes, MAXVNODES_MAX);
320 desiredvnodes = MAXVNODES_MAX;
321 }
322 wantfreevnodes = desiredvnodes / 4;
323 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
324 TAILQ_INIT(&vnode_free_list);
325 mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
326 vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
327 NULL, NULL, UMA_ALIGN_PTR, 0);
328 vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
329 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
330 /*
331 * Initialize the filesystem syncer.
332 */
333 syncer_workitem_pending[WI_MPSAFEQ] = hashinit(syncer_maxdelay, M_VNODE,
334 &syncer_mask);
335 syncer_workitem_pending[WI_GIANTQ] = hashinit(syncer_maxdelay, M_VNODE,
336 &syncer_mask);
337 syncer_maxdelay = syncer_mask + 1;
338 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
339 cv_init(&sync_wakeup, "syncer");
340 }
341 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
342
343
344 /*
345 * Mark a mount point as busy. Used to synchronize access and to delay
346 * unmounting. Eventually, mountlist_mtx is not released on failure.
347 *
348 * vfs_busy() is a custom lock, it can block the caller.
349 * vfs_busy() only sleeps if the unmount is active on the mount point.
350 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
351 * vnode belonging to mp.
352 *
353 * Lookup uses vfs_busy() to traverse mount points.
354 * root fs var fs
355 * / vnode lock A / vnode lock (/var) D
356 * /var vnode lock B /log vnode lock(/var/log) E
357 * vfs_busy lock C vfs_busy lock F
358 *
359 * Within each file system, the lock order is C->A->B and F->D->E.
360 *
361 * When traversing across mounts, the system follows that lock order:
362 *
363 * C->A->B
364 * |
365 * +->F->D->E
366 *
367 * The lookup() process for namei("/var") illustrates the process:
368 * VOP_LOOKUP() obtains B while A is held
369 * vfs_busy() obtains a shared lock on F while A and B are held
370 * vput() releases lock on B
371 * vput() releases lock on A
372 * VFS_ROOT() obtains lock on D while shared lock on F is held
373 * vfs_unbusy() releases shared lock on F
374 * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
375 * Attempt to lock A (instead of vp_crossmp) while D is held would
376 * violate the global order, causing deadlocks.
377 *
378 * dounmount() locks B while F is drained.
379 */
380 int
381 vfs_busy(struct mount *mp, int flags)
382 {
383
384 MPASS((flags & ~MBF_MASK) == 0);
385 CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
386
387 MNT_ILOCK(mp);
388 MNT_REF(mp);
389 /*
390 * If mount point is currenly being unmounted, sleep until the
391 * mount point fate is decided. If thread doing the unmounting fails,
392 * it will clear MNTK_UNMOUNT flag before waking us up, indicating
393 * that this mount point has survived the unmount attempt and vfs_busy
394 * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE
395 * flag in addition to MNTK_UNMOUNT, indicating that mount point is
396 * about to be really destroyed. vfs_busy needs to release its
397 * reference on the mount point in this case and return with ENOENT,
398 * telling the caller that mount mount it tried to busy is no longer
399 * valid.
400 */
401 while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
402 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
403 MNT_REL(mp);
404 MNT_IUNLOCK(mp);
405 CTR1(KTR_VFS, "%s: failed busying before sleeping",
406 __func__);
407 return (ENOENT);
408 }
409 if (flags & MBF_MNTLSTLOCK)
410 mtx_unlock(&mountlist_mtx);
411 mp->mnt_kern_flag |= MNTK_MWAIT;
412 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
413 if (flags & MBF_MNTLSTLOCK)
414 mtx_lock(&mountlist_mtx);
415 MNT_ILOCK(mp);
416 }
417 if (flags & MBF_MNTLSTLOCK)
418 mtx_unlock(&mountlist_mtx);
419 mp->mnt_lockref++;
420 MNT_IUNLOCK(mp);
421 return (0);
422 }
423
424 /*
425 * Free a busy filesystem.
426 */
427 void
428 vfs_unbusy(struct mount *mp)
429 {
430
431 CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
432 MNT_ILOCK(mp);
433 MNT_REL(mp);
434 KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
435 mp->mnt_lockref--;
436 if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
437 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
438 CTR1(KTR_VFS, "%s: waking up waiters", __func__);
439 mp->mnt_kern_flag &= ~MNTK_DRAINING;
440 wakeup(&mp->mnt_lockref);
441 }
442 MNT_IUNLOCK(mp);
443 }
444
445 /*
446 * Lookup a mount point by filesystem identifier.
447 */
448 struct mount *
449 vfs_getvfs(fsid_t *fsid)
450 {
451 struct mount *mp;
452
453 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
454 mtx_lock(&mountlist_mtx);
455 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
456 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
457 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
458 vfs_ref(mp);
459 mtx_unlock(&mountlist_mtx);
460 return (mp);
461 }
462 }
463 mtx_unlock(&mountlist_mtx);
464 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
465 return ((struct mount *) 0);
466 }
467
468 /*
469 * Lookup a mount point by filesystem identifier, busying it before
470 * returning.
471 */
472 struct mount *
473 vfs_busyfs(fsid_t *fsid)
474 {
475 struct mount *mp;
476 int error;
477
478 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
479 mtx_lock(&mountlist_mtx);
480 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
481 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
482 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
483 error = vfs_busy(mp, MBF_MNTLSTLOCK);
484 if (error) {
485 mtx_unlock(&mountlist_mtx);
486 return (NULL);
487 }
488 return (mp);
489 }
490 }
491 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
492 mtx_unlock(&mountlist_mtx);
493 return ((struct mount *) 0);
494 }
495
496 /*
497 * Check if a user can access privileged mount options.
498 */
499 int
500 vfs_suser(struct mount *mp, struct thread *td)
501 {
502 int error;
503
504 /*
505 * If the thread is jailed, but this is not a jail-friendly file
506 * system, deny immediately.
507 */
508 if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
509 return (EPERM);
510
511 /*
512 * If the file system was mounted outside the jail of the calling
513 * thread, deny immediately.
514 */
515 if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
516 return (EPERM);
517
518 /*
519 * If file system supports delegated administration, we don't check
520 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
521 * by the file system itself.
522 * If this is not the user that did original mount, we check for
523 * the PRIV_VFS_MOUNT_OWNER privilege.
524 */
525 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
526 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
527 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
528 return (error);
529 }
530 return (0);
531 }
532
533 /*
534 * Get a new unique fsid. Try to make its val[0] unique, since this value
535 * will be used to create fake device numbers for stat(). Also try (but
536 * not so hard) make its val[0] unique mod 2^16, since some emulators only
537 * support 16-bit device numbers. We end up with unique val[0]'s for the
538 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
539 *
540 * Keep in mind that several mounts may be running in parallel. Starting
541 * the search one past where the previous search terminated is both a
542 * micro-optimization and a defense against returning the same fsid to
543 * different mounts.
544 */
545 void
546 vfs_getnewfsid(struct mount *mp)
547 {
548 static uint16_t mntid_base;
549 struct mount *nmp;
550 fsid_t tfsid;
551 int mtype;
552
553 CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
554 mtx_lock(&mntid_mtx);
555 mtype = mp->mnt_vfc->vfc_typenum;
556 tfsid.val[1] = mtype;
557 mtype = (mtype & 0xFF) << 24;
558 for (;;) {
559 tfsid.val[0] = makedev(255,
560 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
561 mntid_base++;
562 if ((nmp = vfs_getvfs(&tfsid)) == NULL)
563 break;
564 vfs_rel(nmp);
565 }
566 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
567 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
568 mtx_unlock(&mntid_mtx);
569 }
570
571 /*
572 * Knob to control the precision of file timestamps:
573 *
574 * 0 = seconds only; nanoseconds zeroed.
575 * 1 = seconds and nanoseconds, accurate within 1/HZ.
576 * 2 = seconds and nanoseconds, truncated to microseconds.
577 * >=3 = seconds and nanoseconds, maximum precision.
578 */
579 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
580
581 static int timestamp_precision = TSP_SEC;
582 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
583 &timestamp_precision, 0, "File timestamp precision (0: seconds, "
584 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, "
585 "3+: sec + ns (max. precision))");
586
587 /*
588 * Get a current timestamp.
589 */
590 void
591 vfs_timestamp(struct timespec *tsp)
592 {
593 struct timeval tv;
594
595 switch (timestamp_precision) {
596 case TSP_SEC:
597 tsp->tv_sec = time_second;
598 tsp->tv_nsec = 0;
599 break;
600 case TSP_HZ:
601 getnanotime(tsp);
602 break;
603 case TSP_USEC:
604 microtime(&tv);
605 TIMEVAL_TO_TIMESPEC(&tv, tsp);
606 break;
607 case TSP_NSEC:
608 default:
609 nanotime(tsp);
610 break;
611 }
612 }
613
614 /*
615 * Set vnode attributes to VNOVAL
616 */
617 void
618 vattr_null(struct vattr *vap)
619 {
620
621 vap->va_type = VNON;
622 vap->va_size = VNOVAL;
623 vap->va_bytes = VNOVAL;
624 vap->va_mode = VNOVAL;
625 vap->va_nlink = VNOVAL;
626 vap->va_uid = VNOVAL;
627 vap->va_gid = VNOVAL;
628 vap->va_fsid = VNOVAL;
629 vap->va_fileid = VNOVAL;
630 vap->va_blocksize = VNOVAL;
631 vap->va_rdev = VNOVAL;
632 vap->va_atime.tv_sec = VNOVAL;
633 vap->va_atime.tv_nsec = VNOVAL;
634 vap->va_mtime.tv_sec = VNOVAL;
635 vap->va_mtime.tv_nsec = VNOVAL;
636 vap->va_ctime.tv_sec = VNOVAL;
637 vap->va_ctime.tv_nsec = VNOVAL;
638 vap->va_birthtime.tv_sec = VNOVAL;
639 vap->va_birthtime.tv_nsec = VNOVAL;
640 vap->va_flags = VNOVAL;
641 vap->va_gen = VNOVAL;
642 vap->va_vaflags = 0;
643 }
644
645 /*
646 * This routine is called when we have too many vnodes. It attempts
647 * to free <count> vnodes and will potentially free vnodes that still
648 * have VM backing store (VM backing store is typically the cause
649 * of a vnode blowout so we want to do this). Therefore, this operation
650 * is not considered cheap.
651 *
652 * A number of conditions may prevent a vnode from being reclaimed.
653 * the buffer cache may have references on the vnode, a directory
654 * vnode may still have references due to the namei cache representing
655 * underlying files, or the vnode may be in active use. It is not
656 * desireable to reuse such vnodes. These conditions may cause the
657 * number of vnodes to reach some minimum value regardless of what
658 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
659 */
660 static int
661 vlrureclaim(struct mount *mp)
662 {
663 struct vnode *vp;
664 int done;
665 int trigger;
666 int usevnodes;
667 int count;
668
669 /*
670 * Calculate the trigger point, don't allow user
671 * screwups to blow us up. This prevents us from
672 * recycling vnodes with lots of resident pages. We
673 * aren't trying to free memory, we are trying to
674 * free vnodes.
675 */
676 usevnodes = desiredvnodes;
677 if (usevnodes <= 0)
678 usevnodes = 1;
679 trigger = cnt.v_page_count * 2 / usevnodes;
680 done = 0;
681 vn_start_write(NULL, &mp, V_WAIT);
682 MNT_ILOCK(mp);
683 count = mp->mnt_nvnodelistsize / 10 + 1;
684 while (count != 0) {
685 vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
686 while (vp != NULL && vp->v_type == VMARKER)
687 vp = TAILQ_NEXT(vp, v_nmntvnodes);
688 if (vp == NULL)
689 break;
690 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
691 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
692 --count;
693 if (!VI_TRYLOCK(vp))
694 goto next_iter;
695 /*
696 * If it's been deconstructed already, it's still
697 * referenced, or it exceeds the trigger, skip it.
698 */
699 if (vp->v_usecount ||
700 (!vlru_allow_cache_src &&
701 !LIST_EMPTY(&(vp)->v_cache_src)) ||
702 (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
703 vp->v_object->resident_page_count > trigger)) {
704 VI_UNLOCK(vp);
705 goto next_iter;
706 }
707 MNT_IUNLOCK(mp);
708 vholdl(vp);
709 if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
710 vdrop(vp);
711 goto next_iter_mntunlocked;
712 }
713 VI_LOCK(vp);
714 /*
715 * v_usecount may have been bumped after VOP_LOCK() dropped
716 * the vnode interlock and before it was locked again.
717 *
718 * It is not necessary to recheck VI_DOOMED because it can
719 * only be set by another thread that holds both the vnode
720 * lock and vnode interlock. If another thread has the
721 * vnode lock before we get to VOP_LOCK() and obtains the
722 * vnode interlock after VOP_LOCK() drops the vnode
723 * interlock, the other thread will be unable to drop the
724 * vnode lock before our VOP_LOCK() call fails.
725 */
726 if (vp->v_usecount ||
727 (!vlru_allow_cache_src &&
728 !LIST_EMPTY(&(vp)->v_cache_src)) ||
729 (vp->v_object != NULL &&
730 vp->v_object->resident_page_count > trigger)) {
731 VOP_UNLOCK(vp, LK_INTERLOCK);
732 goto next_iter_mntunlocked;
733 }
734 KASSERT((vp->v_iflag & VI_DOOMED) == 0,
735 ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
736 vgonel(vp);
737 VOP_UNLOCK(vp, 0);
738 vdropl(vp);
739 done++;
740 next_iter_mntunlocked:
741 if (!should_yield())
742 goto relock_mnt;
743 goto yield;
744 next_iter:
745 if (!should_yield())
746 continue;
747 MNT_IUNLOCK(mp);
748 yield:
749 kern_yield(PRI_UNCHANGED);
750 relock_mnt:
751 MNT_ILOCK(mp);
752 }
753 MNT_IUNLOCK(mp);
754 vn_finished_write(mp);
755 return done;
756 }
757
758 /*
759 * Attempt to keep the free list at wantfreevnodes length.
760 */
761 static void
762 vnlru_free(int count)
763 {
764 struct vnode *vp;
765 int vfslocked;
766
767 mtx_assert(&vnode_free_list_mtx, MA_OWNED);
768 for (; count > 0; count--) {
769 vp = TAILQ_FIRST(&vnode_free_list);
770 /*
771 * The list can be modified while the free_list_mtx
772 * has been dropped and vp could be NULL here.
773 */
774 if (!vp)
775 break;
776 VNASSERT(vp->v_op != NULL, vp,
777 ("vnlru_free: vnode already reclaimed."));
778 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
779 /*
780 * Don't recycle if we can't get the interlock.
781 */
782 if (!VI_TRYLOCK(vp)) {
783 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
784 continue;
785 }
786 VNASSERT(VCANRECYCLE(vp), vp,
787 ("vp inconsistent on freelist"));
788 freevnodes--;
789 vp->v_iflag &= ~VI_FREE;
790 vholdl(vp);
791 mtx_unlock(&vnode_free_list_mtx);
792 VI_UNLOCK(vp);
793 vfslocked = VFS_LOCK_GIANT(vp->v_mount);
794 vtryrecycle(vp);
795 VFS_UNLOCK_GIANT(vfslocked);
796 /*
797 * If the recycled succeeded this vdrop will actually free
798 * the vnode. If not it will simply place it back on
799 * the free list.
800 */
801 vdrop(vp);
802 mtx_lock(&vnode_free_list_mtx);
803 }
804 }
805 /*
806 * Attempt to recycle vnodes in a context that is always safe to block.
807 * Calling vlrurecycle() from the bowels of filesystem code has some
808 * interesting deadlock problems.
809 */
810 static struct proc *vnlruproc;
811 static int vnlruproc_sig;
812
813 static void
814 vnlru_proc(void)
815 {
816 struct mount *mp, *nmp;
817 int done, vfslocked;
818 struct proc *p = vnlruproc;
819
820 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
821 SHUTDOWN_PRI_FIRST);
822
823 for (;;) {
824 kproc_suspend_check(p);
825 mtx_lock(&vnode_free_list_mtx);
826 if (freevnodes > wantfreevnodes)
827 vnlru_free(freevnodes - wantfreevnodes);
828 if (numvnodes <= desiredvnodes * 9 / 10) {
829 vnlruproc_sig = 0;
830 wakeup(&vnlruproc_sig);
831 msleep(vnlruproc, &vnode_free_list_mtx,
832 PVFS|PDROP, "vlruwt", hz);
833 continue;
834 }
835 mtx_unlock(&vnode_free_list_mtx);
836 done = 0;
837 mtx_lock(&mountlist_mtx);
838 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
839 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
840 nmp = TAILQ_NEXT(mp, mnt_list);
841 continue;
842 }
843 vfslocked = VFS_LOCK_GIANT(mp);
844 done += vlrureclaim(mp);
845 VFS_UNLOCK_GIANT(vfslocked);
846 mtx_lock(&mountlist_mtx);
847 nmp = TAILQ_NEXT(mp, mnt_list);
848 vfs_unbusy(mp);
849 }
850 mtx_unlock(&mountlist_mtx);
851 if (done == 0) {
852 #if 0
853 /* These messages are temporary debugging aids */
854 if (vnlru_nowhere < 5)
855 printf("vnlru process getting nowhere..\n");
856 else if (vnlru_nowhere == 5)
857 printf("vnlru process messages stopped.\n");
858 #endif
859 vnlru_nowhere++;
860 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
861 } else
862 kern_yield(PRI_UNCHANGED);
863 }
864 }
865
866 static struct kproc_desc vnlru_kp = {
867 "vnlru",
868 vnlru_proc,
869 &vnlruproc
870 };
871 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
872 &vnlru_kp);
873
874 /*
875 * Routines having to do with the management of the vnode table.
876 */
877
878 /*
879 * Try to recycle a freed vnode. We abort if anyone picks up a reference
880 * before we actually vgone(). This function must be called with the vnode
881 * held to prevent the vnode from being returned to the free list midway
882 * through vgone().
883 */
884 static int
885 vtryrecycle(struct vnode *vp)
886 {
887 struct mount *vnmp;
888
889 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
890 VNASSERT(vp->v_holdcnt, vp,
891 ("vtryrecycle: Recycling vp %p without a reference.", vp));
892 /*
893 * This vnode may found and locked via some other list, if so we
894 * can't recycle it yet.
895 */
896 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
897 CTR2(KTR_VFS,
898 "%s: impossible to recycle, vp %p lock is already held",
899 __func__, vp);
900 return (EWOULDBLOCK);
901 }
902 /*
903 * Don't recycle if its filesystem is being suspended.
904 */
905 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
906 VOP_UNLOCK(vp, 0);
907 CTR2(KTR_VFS,
908 "%s: impossible to recycle, cannot start the write for %p",
909 __func__, vp);
910 return (EBUSY);
911 }
912 /*
913 * If we got this far, we need to acquire the interlock and see if
914 * anyone picked up this vnode from another list. If not, we will
915 * mark it with DOOMED via vgonel() so that anyone who does find it
916 * will skip over it.
917 */
918 VI_LOCK(vp);
919 if (vp->v_usecount) {
920 VOP_UNLOCK(vp, LK_INTERLOCK);
921 vn_finished_write(vnmp);
922 CTR2(KTR_VFS,
923 "%s: impossible to recycle, %p is already referenced",
924 __func__, vp);
925 return (EBUSY);
926 }
927 if ((vp->v_iflag & VI_DOOMED) == 0)
928 vgonel(vp);
929 VOP_UNLOCK(vp, LK_INTERLOCK);
930 vn_finished_write(vnmp);
931 return (0);
932 }
933
934 /*
935 * Return the next vnode from the free list.
936 */
937 int
938 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
939 struct vnode **vpp)
940 {
941 struct vnode *vp = NULL;
942 struct bufobj *bo;
943
944 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
945 mtx_lock(&vnode_free_list_mtx);
946 /*
947 * Lend our context to reclaim vnodes if they've exceeded the max.
948 */
949 if (freevnodes > wantfreevnodes)
950 vnlru_free(1);
951 /*
952 * Wait for available vnodes.
953 */
954 if (numvnodes > desiredvnodes) {
955 if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) {
956 /*
957 * File system is beeing suspended, we cannot risk a
958 * deadlock here, so allocate new vnode anyway.
959 */
960 if (freevnodes > wantfreevnodes)
961 vnlru_free(freevnodes - wantfreevnodes);
962 goto alloc;
963 }
964 if (vnlruproc_sig == 0) {
965 vnlruproc_sig = 1; /* avoid unnecessary wakeups */
966 wakeup(vnlruproc);
967 }
968 msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
969 "vlruwk", hz);
970 #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
971 if (numvnodes > desiredvnodes) {
972 mtx_unlock(&vnode_free_list_mtx);
973 return (ENFILE);
974 }
975 #endif
976 }
977 alloc:
978 numvnodes++;
979 mtx_unlock(&vnode_free_list_mtx);
980 vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
981 /*
982 * Setup locks.
983 */
984 vp->v_vnlock = &vp->v_lock;
985 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
986 /*
987 * By default, don't allow shared locks unless filesystems
988 * opt-in.
989 */
990 lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
991 /*
992 * Initialize bufobj.
993 */
994 bo = &vp->v_bufobj;
995 bo->__bo_vnode = vp;
996 mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF);
997 bo->bo_ops = &buf_ops_bio;
998 bo->bo_private = vp;
999 TAILQ_INIT(&bo->bo_clean.bv_hd);
1000 TAILQ_INIT(&bo->bo_dirty.bv_hd);
1001 /*
1002 * Initialize namecache.
1003 */
1004 LIST_INIT(&vp->v_cache_src);
1005 TAILQ_INIT(&vp->v_cache_dst);
1006 /*
1007 * Finalize various vnode identity bits.
1008 */
1009 vp->v_type = VNON;
1010 vp->v_tag = tag;
1011 vp->v_op = vops;
1012 v_incr_usecount(vp);
1013 vp->v_data = NULL;
1014 #ifdef MAC
1015 mac_vnode_init(vp);
1016 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1017 mac_vnode_associate_singlelabel(mp, vp);
1018 else if (mp == NULL && vops != &dead_vnodeops)
1019 printf("NULL mp in getnewvnode()\n");
1020 #endif
1021 if (mp != NULL) {
1022 bo->bo_bsize = mp->mnt_stat.f_iosize;
1023 if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1024 vp->v_vflag |= VV_NOKNOTE;
1025 }
1026
1027 *vpp = vp;
1028 return (0);
1029 }
1030
1031 /*
1032 * Delete from old mount point vnode list, if on one.
1033 */
1034 static void
1035 delmntque(struct vnode *vp)
1036 {
1037 struct mount *mp;
1038
1039 mp = vp->v_mount;
1040 if (mp == NULL)
1041 return;
1042 MNT_ILOCK(mp);
1043 vp->v_mount = NULL;
1044 VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1045 ("bad mount point vnode list size"));
1046 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1047 mp->mnt_nvnodelistsize--;
1048 MNT_REL(mp);
1049 MNT_IUNLOCK(mp);
1050 }
1051
1052 static void
1053 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1054 {
1055
1056 vp->v_data = NULL;
1057 vp->v_op = &dead_vnodeops;
1058 /* XXX non mp-safe fs may still call insmntque with vnode
1059 unlocked */
1060 if (!VOP_ISLOCKED(vp))
1061 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1062 vgone(vp);
1063 vput(vp);
1064 }
1065
1066 /*
1067 * Insert into list of vnodes for the new mount point, if available.
1068 */
1069 int
1070 insmntque1(struct vnode *vp, struct mount *mp,
1071 void (*dtr)(struct vnode *, void *), void *dtr_arg)
1072 {
1073 int locked;
1074
1075 KASSERT(vp->v_mount == NULL,
1076 ("insmntque: vnode already on per mount vnode list"));
1077 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1078 #ifdef DEBUG_VFS_LOCKS
1079 if (!VFS_NEEDSGIANT(mp))
1080 ASSERT_VOP_ELOCKED(vp,
1081 "insmntque: mp-safe fs and non-locked vp");
1082 #endif
1083 MNT_ILOCK(mp);
1084 if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1085 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1086 mp->mnt_nvnodelistsize == 0)) {
1087 locked = VOP_ISLOCKED(vp);
1088 if (!locked || (locked == LK_EXCLUSIVE &&
1089 (vp->v_vflag & VV_FORCEINSMQ) == 0)) {
1090 MNT_IUNLOCK(mp);
1091 if (dtr != NULL)
1092 dtr(vp, dtr_arg);
1093 return (EBUSY);
1094 }
1095 }
1096 vp->v_mount = mp;
1097 MNT_REF(mp);
1098 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1099 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1100 ("neg mount point vnode list size"));
1101 mp->mnt_nvnodelistsize++;
1102 MNT_IUNLOCK(mp);
1103 return (0);
1104 }
1105
1106 int
1107 insmntque(struct vnode *vp, struct mount *mp)
1108 {
1109
1110 return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1111 }
1112
/*
 * Flush out and invalidate all buffers associated with a bufobj.
 * Called with the underlying object locked.
 *
 * flags is tested for:
 *   V_SAVE      - wait for writes in progress and sync dirty buffers
 *                 (BO_SYNC) before invalidating; page removal is then
 *                 restricted to clean pages (OBJPR_CLEANONLY).
 *   V_CLEANONLY - only the clean buffer list is flushed.
 *   V_ALT / V_NORMAL - passed on to flushbuflist(); when any of V_ALT,
 *                 V_NORMAL or V_CLEANONLY is set the VM object pages are
 *                 left alone and the final emptiness check is skipped.
 * slpflag and slptimeo are handed to bufobj_wwait() and flushbuflist().
 *
 * Returns 0 on success or the first error reported by bufobj_wwait(),
 * BO_SYNC() or flushbuflist() (EAGAIN from flushbuflist() is retried,
 * not returned).
 */
int
bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
{
	int error;

	BO_LOCK(bo);
	if (flags & V_SAVE) {
		/* Let writes already in progress drain first. */
		error = bufobj_wwait(bo, slpflag, slptimeo);
		if (error) {
			BO_UNLOCK(bo);
			return (error);
		}
		if (bo->bo_dirty.bv_cnt > 0) {
			/* BO_SYNC() is called with the bufobj unlocked. */
			BO_UNLOCK(bo);
			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
				return (error);
			/*
			 * XXX We could save a lock/unlock if this was only
			 * enabled under INVARIANTS
			 */
			BO_LOCK(bo);
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("vinvalbuf: dirty bufs");
		}
	}
	/*
	 * If you alter this loop please notice that interlock is dropped and
	 * reacquired in flushbuflist.  Special care is needed to ensure that
	 * no race conditions occur from this.
	 *
	 * The loop repeats for as long as flushbuflist() reports EAGAIN,
	 * i.e. until a pass over the list(s) completes with error == 0.
	 */
	do {
		error = flushbuflist(&bo->bo_clean,
		    flags, bo, slpflag, slptimeo);
		if (error == 0 && !(flags & V_CLEANONLY))
			error = flushbuflist(&bo->bo_dirty,
			    flags, bo, slpflag, slptimeo);
		if (error != 0 && error != EAGAIN) {
			BO_UNLOCK(bo);
			return (error);
		}
	} while (error != 0);

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		bufobj_wwait(bo, 0, 0);
		BO_UNLOCK(bo);
		if (bo->bo_object != NULL) {
			VM_OBJECT_LOCK(bo->bo_object);
			vm_object_pip_wait(bo->bo_object, "bovlbx");
			VM_OBJECT_UNLOCK(bo->bo_object);
		}
		BO_LOCK(bo);
	} while (bo->bo_numoutput > 0);
	BO_UNLOCK(bo);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	if (bo->bo_object != NULL &&
	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
		VM_OBJECT_LOCK(bo->bo_object);
		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
		    OBJPR_CLEANONLY : 0);
		VM_OBJECT_UNLOCK(bo->bo_object);
	}

#ifdef INVARIANTS
	/* After a full invalidation both buffer lists must be empty. */
	BO_LOCK(bo);
	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
		panic("vinvalbuf: flush failed");
	BO_UNLOCK(bo);
#endif
	return (0);
}
1196
1197 /*
1198 * Flush out and invalidate all buffers associated with a vnode.
1199 * Called with the underlying object locked.
1200 */
1201 int
1202 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1203 {
1204
1205 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1206 ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1207 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1208 }
1209
/*
 * Flush out buffers on the specified list.
 *
 * Walks bufv (one of bo's clean/dirty lists) and invalidates and releases
 * each buffer selected by flags: with V_NORMAL, BX_ALTDATA buffers are
 * skipped; with V_ALT, only BX_ALTDATA buffers are processed.  With V_SAVE,
 * a delayed-write buffer that is not already invalid is written out
 * asynchronously instead of being discarded.
 *
 * Must be entered with the bufobj locked (asserted) and every return path
 * leaves it locked again, although the lock is dropped while a buffer is
 * being handled.
 *
 * Returns 0 if no buffer on the list was selected, EAGAIN if at least one
 * buffer was handled (the caller is expected to call again), or the error
 * from BUF_TIMELOCK() with ENOLCK translated to EAGAIN.
 */
static int
flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
    int slptimeo)
{
	struct buf *bp, *nbp;
	int retval, error;
	daddr_t lblkno;
	b_xflags_t xflags;

	ASSERT_BO_LOCKED(bo);

	retval = 0;
	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
			continue;
		}
		/*
		 * Remember the identity of the successor so that, once the
		 * bufobj lock has been dropped and retaken, we can tell
		 * whether nbp is still the buffer we expected.
		 */
		lblkno = 0;
		xflags = 0;
		if (nbp != NULL) {
			lblkno = nbp->b_lblkno;
			xflags = nbp->b_xflags &
			    (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
		}
		retval = EAGAIN;
		/*
		 * LK_INTERLOCK hands the bufobj mutex to the buffer lock
		 * operation; the code below retakes it explicitly on every
		 * path, so it is treated as released from here on.
		 */
		error = BUF_TIMELOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
		    "flushbuf", slpflag, slptimeo);
		if (error) {
			BO_LOCK(bo);
			return (error != ENOLCK ? error : EAGAIN);
		}
		KASSERT(bp->b_bufobj == bo,
		    ("bp %p wrong b_bufobj %p should be %p",
		    bp, bp->b_bufobj, bo));
		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
			BUF_UNLOCK(bp);
			BO_LOCK(bo);
			return (EAGAIN);
		}
		/*
		 * XXX Since there are no node locks for NFS, I
		 * believe there is a slight chance that a delayed
		 * write will occur while sleeping just above, so
		 * check for it.
		 */
		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
		    (flags & V_SAVE)) {
			BO_LOCK(bo);
			bremfree(bp);
			BO_UNLOCK(bo);
			bp->b_flags |= B_ASYNC;
			bwrite(bp);
			BO_LOCK(bo);
			return (EAGAIN);	/* XXX: why not loop ? */
		}
		/* Discard the buffer: mark it invalid and release it. */
		BO_LOCK(bo);
		bremfree(bp);
		BO_UNLOCK(bo);
		bp->b_flags |= (B_INVAL | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
		BO_LOCK(bo);
		if (nbp != NULL &&
		    (nbp->b_bufobj != bo ||
		    nbp->b_lblkno != lblkno ||
		    (nbp->b_xflags &
		    (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
			break;			/* nbp invalid */
	}
	return (retval);
}
1286
1287 /*
1288 * Truncate a file's buffer and pages to a specified length. This
1289 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1290 * sync activity.
1291 */
1292 int
1293 vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
1294 off_t length, int blksize)
1295 {
1296 struct buf *bp, *nbp;
1297 int anyfreed;
1298 int trunclbn;
1299 struct bufobj *bo;
1300
1301 CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1302 vp, cred, blksize, (uintmax_t)length);
1303
1304 /*
1305 * Round up to the *next* lbn.
1306 */
1307 trunclbn = (length + blksize - 1) / blksize;
1308
1309 ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1310 restart:
1311 bo = &vp->v_bufobj;
1312 BO_LOCK(bo);
1313 anyfreed = 1;
1314 for (;anyfreed;) {
1315 anyfreed = 0;
1316 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1317 if (bp->b_lblkno < trunclbn)
1318 continue;
1319 if (BUF_LOCK(bp,
1320 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1321 BO_MTX(bo)) == ENOLCK)
1322 goto restart;
1323
1324 BO_LOCK(bo);
1325 bremfree(bp);
1326 BO_UNLOCK(bo);
1327 bp->b_flags |= (B_INVAL | B_RELBUF);
1328 bp->b_flags &= ~B_ASYNC;
1329 brelse(bp);
1330 anyfreed = 1;
1331
1332 BO_LOCK(bo);
1333 if (nbp != NULL &&
1334 (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1335 (nbp->b_vp != vp) ||
1336 (nbp->b_flags & B_DELWRI))) {
1337 BO_UNLOCK(bo);
1338 goto restart;
1339 }
1340 }
1341
1342 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1343 if (bp->b_lblkno < trunclbn)
1344 continue;
1345 if (BUF_LOCK(bp,
1346 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1347 BO_MTX(bo)) == ENOLCK)
1348 goto restart;
1349 BO_LOCK(bo);
1350 bremfree(bp);
1351 BO_UNLOCK(bo);
1352 bp->b_flags |= (B_INVAL | B_RELBUF);
1353 bp->b_flags &= ~B_ASYNC;
1354 brelse(bp);
1355 anyfreed = 1;
1356
1357 BO_LOCK(bo);
1358 if (nbp != NULL &&
1359 (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1360 (nbp->b_vp != vp) ||
1361 (nbp->b_flags & B_DELWRI) == 0)) {
1362 BO_UNLOCK(bo);
1363 goto restart;
1364 }
1365 }
1366 }
1367
1368 if (length > 0) {
1369 restartsync:
1370 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1371 if (bp->b_lblkno > 0)
1372 continue;
1373 /*
1374 * Since we hold the vnode lock this should only
1375 * fail if we're racing with the buf daemon.
1376 */
1377 if (BUF_LOCK(bp,
1378 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1379 BO_MTX(bo)) == ENOLCK) {
1380 goto restart;
1381 }
1382 VNASSERT((bp->b_flags & B_DELWRI), vp,
1383 ("buf(%p) on dirty queue without DELWRI", bp));
1384
1385 BO_LOCK(bo);
1386 bremfree(bp);
1387 BO_UNLOCK(bo);
1388 bawrite(bp);
1389 BO_LOCK(bo);
1390 goto restartsync;
1391 }
1392 }
1393
1394 bufobj_wwait(bo, 0, 0);
1395 BO_UNLOCK(bo);
1396 vnode_pager_setsize(vp, length);
1397
1398 return (0);
1399 }
1400
/*
 * buf_splay() - splay tree core for the clean/dirty list of buffers in
 * a vnode.
 *
 * NOTE: We have to deal with the special case of a background bitmap
 * buffer, a situation where two buffers will have the same logical
 * block offset.  We want (1) only the foreground buffer to be accessed
 * in a lookup and (2) must differentiate between the foreground and
 * background buffer in the splay tree algorithm because the splay
 * tree cannot normally handle multiple entities with the same 'index'.
 * We accomplish this by adding differentiating flags to the splay tree's
 * numerical domain.
 *
 * The key is the pair (lblkno, xflags & BX_BKGRDMARKER).  The tree rooted
 * at 'root' is restructured around that key and the new root is returned:
 * either the node with exactly that key, or the last node visited on the
 * search path when no such node exists.  Returns NULL for an empty tree.
 */
static
struct buf *
buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
{
	/*
	 * dummy is a stack placeholder whose b_right / b_left fields collect
	 * the heads of the "smaller" and "larger" trees built during descent;
	 * its other fields are never read.
	 */
	struct buf dummy;
	struct buf *lefttreemax, *righttreemin, *y;

	if (root == NULL)
		return (NULL);
	lefttreemax = righttreemin = &dummy;
	for (;;) {
		if (lblkno < root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
			/* Key sorts before root: descend to the left. */
			if ((y = root->b_left) == NULL)
				break;
			if (lblkno < y->b_lblkno) {
				/* Rotate right. */
				root->b_left = y->b_right;
				y->b_right = root;
				root = y;
				if ((y = root->b_left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->b_left = root;
			righttreemin = root;
		} else if (lblkno > root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
			/* Key sorts after root: descend to the right. */
			if ((y = root->b_right) == NULL)
				break;
			if (lblkno > y->b_lblkno) {
				/* Rotate left. */
				root->b_right = y->b_left;
				y->b_left = root;
				root = y;
				if ((y = root->b_right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->b_right = root;
			lefttreemax = root;
		} else {
			/* Exact match on both block number and marker. */
			break;
		}
		root = y;
	}
	/* Assemble the new root. */
	lefttreemax->b_right = root->b_left;
	righttreemin->b_left = root->b_right;
	root->b_left = dummy.b_right;
	root->b_right = dummy.b_left;
	return (root);
}
1469
1470 static void
1471 buf_vlist_remove(struct buf *bp)
1472 {
1473 struct buf *root;
1474 struct bufv *bv;
1475
1476 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1477 ASSERT_BO_LOCKED(bp->b_bufobj);
1478 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1479 (BX_VNDIRTY|BX_VNCLEAN),
1480 ("buf_vlist_remove: Buf %p is on two lists", bp));
1481 if (bp->b_xflags & BX_VNDIRTY)
1482 bv = &bp->b_bufobj->bo_dirty;
1483 else
1484 bv = &bp->b_bufobj->bo_clean;
1485 if (bp != bv->bv_root) {
1486 root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1487 KASSERT(root == bp, ("splay lookup failed in remove"));
1488 }
1489 if (bp->b_left == NULL) {
1490 root = bp->b_right;
1491 } else {
1492 root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1493 root->b_right = bp->b_right;
1494 }
1495 bv->bv_root = root;
1496 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1497 bv->bv_cnt--;
1498 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1499 }
1500
1501 /*
1502 * Add the buffer to the sorted clean or dirty block list using a
1503 * splay tree algorithm.
1504 *
1505 * NOTE: xflags is passed as a constant, optimizing this inline function!
1506 */
1507 static void
1508 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1509 {
1510 struct buf *root;
1511 struct bufv *bv;
1512
1513 ASSERT_BO_LOCKED(bo);
1514 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1515 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1516 bp->b_xflags |= xflags;
1517 if (xflags & BX_VNDIRTY)
1518 bv = &bo->bo_dirty;
1519 else
1520 bv = &bo->bo_clean;
1521
1522 root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1523 if (root == NULL) {
1524 bp->b_left = NULL;
1525 bp->b_right = NULL;
1526 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1527 } else if (bp->b_lblkno < root->b_lblkno ||
1528 (bp->b_lblkno == root->b_lblkno &&
1529 (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1530 bp->b_left = root->b_left;
1531 bp->b_right = root;
1532 root->b_left = NULL;
1533 TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
1534 } else {
1535 bp->b_right = root->b_right;
1536 bp->b_left = root;
1537 root->b_right = NULL;
1538 TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
1539 }
1540 bv->bv_cnt++;
1541 bv->bv_root = bp;
1542 }
1543
1544 /*
1545 * Lookup a buffer using the splay tree. Note that we specifically avoid
1546 * shadow buffers used in background bitmap writes.
1547 *
1548 * This code isn't quite efficient as it could be because we are maintaining
1549 * two sorted lists and do not know which list the block resides in.
1550 *
1551 * During a "make buildworld" the desired buffer is found at one of
1552 * the roots more than 60% of the time. Thus, checking both roots
1553 * before performing either splay eliminates unnecessary splays on the
1554 * first tree splayed.
1555 */
1556 struct buf *
1557 gbincore(struct bufobj *bo, daddr_t lblkno)
1558 {
1559 struct buf *bp;
1560
1561 ASSERT_BO_LOCKED(bo);
1562 if ((bp = bo->bo_clean.bv_root) != NULL &&
1563 bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1564 return (bp);
1565 if ((bp = bo->bo_dirty.bv_root) != NULL &&
1566 bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1567 return (bp);
1568 if ((bp = bo->bo_clean.bv_root) != NULL) {
1569 bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
1570 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1571 return (bp);
1572 }
1573 if ((bp = bo->bo_dirty.bv_root) != NULL) {
1574 bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
1575 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1576 return (bp);
1577 }
1578 return (NULL);
1579 }
1580
1581 /*
1582 * Associate a buffer with a vnode.
1583 */
1584 void
1585 bgetvp(struct vnode *vp, struct buf *bp)
1586 {
1587 struct bufobj *bo;
1588
1589 bo = &vp->v_bufobj;
1590 ASSERT_BO_LOCKED(bo);
1591 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1592
1593 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1594 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1595 ("bgetvp: bp already attached! %p", bp));
1596
1597 vhold(vp);
1598 if (VFS_NEEDSGIANT(vp->v_mount) || bo->bo_flag & BO_NEEDSGIANT)
1599 bp->b_flags |= B_NEEDSGIANT;
1600 bp->b_vp = vp;
1601 bp->b_bufobj = bo;
1602 /*
1603 * Insert onto list for new vnode.
1604 */
1605 buf_vlist_add(bp, bo, BX_VNCLEAN);
1606 }
1607
1608 /*
1609 * Disassociate a buffer from a vnode.
1610 */
1611 void
1612 brelvp(struct buf *bp)
1613 {
1614 struct bufobj *bo;
1615 struct vnode *vp;
1616
1617 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1618 KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1619
1620 /*
1621 * Delete from old vnode list, if on one.
1622 */
1623 vp = bp->b_vp; /* XXX */
1624 bo = bp->b_bufobj;
1625 BO_LOCK(bo);
1626 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1627 buf_vlist_remove(bp);
1628 else
1629 panic("brelvp: Buffer %p not on queue.", bp);
1630 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1631 bo->bo_flag &= ~BO_ONWORKLST;
1632 mtx_lock(&sync_mtx);
1633 LIST_REMOVE(bo, bo_synclist);
1634 syncer_worklist_len--;
1635 mtx_unlock(&sync_mtx);
1636 }
1637 bp->b_flags &= ~B_NEEDSGIANT;
1638 bp->b_vp = NULL;
1639 bp->b_bufobj = NULL;
1640 BO_UNLOCK(bo);
1641 vdrop(vp);
1642 }
1643
1644 /*
1645 * Add an item to the syncer work queue.
1646 */
1647 static void
1648 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1649 {
1650 int queue, slot;
1651
1652 ASSERT_BO_LOCKED(bo);
1653
1654 mtx_lock(&sync_mtx);
1655 if (bo->bo_flag & BO_ONWORKLST)
1656 LIST_REMOVE(bo, bo_synclist);
1657 else {
1658 bo->bo_flag |= BO_ONWORKLST;
1659 syncer_worklist_len++;
1660 }
1661
1662 if (delay > syncer_maxdelay - 2)
1663 delay = syncer_maxdelay - 2;
1664 slot = (syncer_delayno + delay) & syncer_mask;
1665
1666 queue = VFS_NEEDSGIANT(bo->__bo_vnode->v_mount) ? WI_GIANTQ :
1667 WI_MPSAFEQ;
1668 LIST_INSERT_HEAD(&syncer_workitem_pending[queue][slot], bo,
1669 bo_synclist);
1670 mtx_unlock(&sync_mtx);
1671 }
1672
1673 static int
1674 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1675 {
1676 int error, len;
1677
1678 mtx_lock(&sync_mtx);
1679 len = syncer_worklist_len - sync_vnode_count;
1680 mtx_unlock(&sync_mtx);
1681 error = SYSCTL_OUT(req, &len, sizeof(len));
1682 return (error);
1683 }
1684
1685 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1686 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1687
/*
 * Registration of the "syncer" kernel process: kproc_start() is invoked
 * through SYSINIT with up_kp, which names the process, gives sched_sync()
 * as its main function and supplies updateproc as the location associated
 * with the process pointer.
 */
static struct proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1696
1697 static int
1698 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1699 {
1700 struct vnode *vp;
1701 struct mount *mp;
1702
1703 *bo = LIST_FIRST(slp);
1704 if (*bo == NULL)
1705 return (0);
1706 vp = (*bo)->__bo_vnode; /* XXX */
1707 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1708 return (1);
1709 /*
1710 * We use vhold in case the vnode does not
1711 * successfully sync. vhold prevents the vnode from
1712 * going away when we unlock the sync_mtx so that
1713 * we can acquire the vnode interlock.
1714 */
1715 vholdl(vp);
1716 mtx_unlock(&sync_mtx);
1717 VI_UNLOCK(vp);
1718 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1719 vdrop(vp);
1720 mtx_lock(&sync_mtx);
1721 return (*bo == LIST_FIRST(slp));
1722 }
1723 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1724 (void) VOP_FSYNC(vp, MNT_LAZY, td);
1725 VOP_UNLOCK(vp, 0);
1726 vn_finished_write(mp);
1727 BO_LOCK(*bo);
1728 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1729 /*
1730 * Put us back on the worklist. The worklist
1731 * routine will remove us from our current
1732 * position and then add us back in at a later
1733 * position.
1734 */
1735 vn_syncer_add_to_worklist(*bo, syncdelay);
1736 }
1737 BO_UNLOCK(*bo);
1738 vdrop(vp);
1739 mtx_lock(&sync_mtx);
1740 return (0);
1741 }
1742
/*
 * System filesystem synchronizer daemon.
 *
 * Main loop of the syncer process.  Each iteration takes the worklist
 * slot at syncer_delayno (one list for MP-safe filesystems, one for those
 * needing Giant), advances syncer_delayno, and calls sync_vnode() on the
 * entries of both lists until they are empty; entries sync_vnode() could
 * not handle are moved to the following slot.  It then either continues
 * immediately (rushjob), or sleeps on sync_wakeup.
 *
 * syncer_state drives shutdown behaviour: in any state other than
 * SYNCER_RUNNING progress is printed, empty slots are skipped and the
 * sleep is shortened; SYNCER_FINAL_DELAY is entered from
 * SYNCER_SHUTTING_DOWN once the worklist has wrapped with nothing but
 * syncer vnodes on it, and the process checks for suspension once the
 * final iterations have been used up.  This function never returns.
 */
static void
sched_sync(void)
{
	struct synclist *gnext, *next;
	struct synclist *gslp, *slp;
	struct bufobj *bo;
	long starttime;
	struct thread *td = curthread;
	int last_work_seen;
	int net_worklist_len;
	int syncer_final_iter;
	int first_printf;
	int error;

	last_work_seen = 0;
	syncer_final_iter = 0;
	first_printf = 1;
	syncer_state = SYNCER_RUNNING;
	starttime = time_uptime;
	td->td_pflags |= TDP_NORUNNINGBUF;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
	    SHUTDOWN_PRI_LAST);

	mtx_lock(&sync_mtx);
	for (;;) {
		if (syncer_state == SYNCER_FINAL_DELAY &&
		    syncer_final_iter == 0) {
			/* Final delay exhausted: allow suspension. */
			mtx_unlock(&sync_mtx);
			kproc_suspend_check(td->td_proc);
			mtx_lock(&sync_mtx);
		}
		net_worklist_len = syncer_worklist_len - sync_vnode_count;
		/* While shutting down, report progress once per second. */
		if (syncer_state != SYNCER_RUNNING &&
		    starttime != time_uptime) {
			if (first_printf) {
				printf("\nSyncing disks, vnodes remaining...");
				first_printf = 0;
			}
			printf("%d ", net_worklist_len);
		}
		starttime = time_uptime;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 *
		 * Skip over empty worklist slots when shutting down.
		 */
		do {
			slp = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
			gslp = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
			syncer_delayno += 1;
			if (syncer_delayno == syncer_maxdelay)
				syncer_delayno = 0;
			next = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
			gnext = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
			/*
			 * If the worklist has wrapped since it was
			 * emptied of all but syncer vnodes,
			 * switch to the FINAL_DELAY state and run
			 * for one more second.
			 */
			if (syncer_state == SYNCER_SHUTTING_DOWN &&
			    net_worklist_len == 0 &&
			    last_work_seen == syncer_delayno) {
				syncer_state = SYNCER_FINAL_DELAY;
				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
			}
		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
		    LIST_EMPTY(gslp) && syncer_worklist_len > 0);

		/*
		 * Keep track of the last time there was anything
		 * on the worklist other than syncer vnodes.
		 * Return to the SHUTTING_DOWN state if any
		 * new work appears.
		 */
		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
			last_work_seen = syncer_delayno;
		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
			syncer_state = SYNCER_SHUTTING_DOWN;
		/* Drain the MP-safe slot. */
		while (!LIST_EMPTY(slp)) {
			error = sync_vnode(slp, &bo, td);
			if (error == 1) {
				/* Could not sync it now; defer to next slot. */
				LIST_REMOVE(bo, bo_synclist);
				LIST_INSERT_HEAD(next, bo, bo_synclist);
				continue;
			}
#ifdef SW_WATCHDOG
			if (first_printf == 0)
				wdog_kern_pat(WD_LASTVAL);
#endif
		}
		/* Drain the Giant slot, taking Giant before sync_mtx. */
		if (!LIST_EMPTY(gslp)) {
			mtx_unlock(&sync_mtx);
			mtx_lock(&Giant);
			mtx_lock(&sync_mtx);
			while (!LIST_EMPTY(gslp)) {
				error = sync_vnode(gslp, &bo, td);
				if (error == 1) {
					LIST_REMOVE(bo, bo_synclist);
					LIST_INSERT_HEAD(gnext, bo,
					    bo_synclist);
					continue;
				}
			}
			mtx_unlock(&Giant);
		}
		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
			syncer_final_iter--;
		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process.  A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP.  Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * Just sleep for a short period of time between
		 * iterations when shutting down to allow some I/O
		 * to happen.
		 *
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (syncer_state != SYNCER_RUNNING ||
		    time_uptime == starttime) {
			thread_lock(td);
			sched_prio(td, PPAUSE);
			thread_unlock(td);
		}
		if (syncer_state != SYNCER_RUNNING)
			cv_timedwait(&sync_wakeup, &sync_mtx,
			    hz / SYNCER_SHUTDOWN_SPEEDUP);
		else if (time_uptime == starttime)
			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
	}
}
1896
1897 /*
1898 * Request the syncer daemon to speed up its work.
1899 * We never push it to speed up more than half of its
1900 * normal turn time, otherwise it could take over the cpu.
1901 */
1902 int
1903 speedup_syncer(void)
1904 {
1905 int ret = 0;
1906
1907 mtx_lock(&sync_mtx);
1908 if (rushjob < syncdelay / 2) {
1909 rushjob += 1;
1910 stat_rush_requests += 1;
1911 ret = 1;
1912 }
1913 mtx_unlock(&sync_mtx);
1914 cv_broadcast(&sync_wakeup);
1915 return (ret);
1916 }
1917
1918 /*
1919 * Tell the syncer to speed up its work and run though its work
1920 * list several times, then tell it to shut down.
1921 */
1922 static void
1923 syncer_shutdown(void *arg, int howto)
1924 {
1925
1926 if (howto & RB_NOSYNC)
1927 return;
1928 mtx_lock(&sync_mtx);
1929 syncer_state = SYNCER_SHUTTING_DOWN;
1930 rushjob = 0;
1931 mtx_unlock(&sync_mtx);
1932 cv_broadcast(&sync_wakeup);
1933 kproc_shutdown(arg, howto);
1934 }
1935
1936 /*
1937 * Reassign a buffer from one vnode to another.
1938 * Used to assign file specific control information
1939 * (indirect blocks) to the vnode to which they belong.
1940 */
1941 void
1942 reassignbuf(struct buf *bp)
1943 {
1944 struct vnode *vp;
1945 struct bufobj *bo;
1946 int delay;
1947 #ifdef INVARIANTS
1948 struct bufv *bv;
1949 #endif
1950
1951 vp = bp->b_vp;
1952 bo = bp->b_bufobj;
1953 ++reassignbufcalls;
1954
1955 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
1956 bp, bp->b_vp, bp->b_flags);
1957 /*
1958 * B_PAGING flagged buffers cannot be reassigned because their vp
1959 * is not fully linked in.
1960 */
1961 if (bp->b_flags & B_PAGING)
1962 panic("cannot reassign paging buffer");
1963
1964 /*
1965 * Delete from old vnode list, if on one.
1966 */
1967 BO_LOCK(bo);
1968 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1969 buf_vlist_remove(bp);
1970 else
1971 panic("reassignbuf: Buffer %p not on queue.", bp);
1972 /*
1973 * If dirty, put on list of dirty buffers; otherwise insert onto list
1974 * of clean buffers.
1975 */
1976 if (bp->b_flags & B_DELWRI) {
1977 if ((bo->bo_flag & BO_ONWORKLST) == 0) {
1978 switch (vp->v_type) {
1979 case VDIR:
1980 delay = dirdelay;
1981 break;
1982 case VCHR:
1983 delay = metadelay;
1984 break;
1985 default:
1986 delay = filedelay;
1987 }
1988 vn_syncer_add_to_worklist(bo, delay);
1989 }
1990 buf_vlist_add(bp, bo, BX_VNDIRTY);
1991 } else {
1992 buf_vlist_add(bp, bo, BX_VNCLEAN);
1993
1994 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1995 mtx_lock(&sync_mtx);
1996 LIST_REMOVE(bo, bo_synclist);
1997 syncer_worklist_len--;
1998 mtx_unlock(&sync_mtx);
1999 bo->bo_flag &= ~BO_ONWORKLST;
2000 }
2001 }
2002 #ifdef INVARIANTS
2003 bv = &bo->bo_clean;
2004 bp = TAILQ_FIRST(&bv->bv_hd);
2005 KASSERT(bp == NULL || bp->b_bufobj == bo,
2006 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2007 bp = TAILQ_LAST(&bv->bv_hd, buflists);
2008 KASSERT(bp == NULL || bp->b_bufobj == bo,
2009 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2010 bv = &bo->bo_dirty;
2011 bp = TAILQ_FIRST(&bv->bv_hd);
2012 KASSERT(bp == NULL || bp->b_bufobj == bo,
2013 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2014 bp = TAILQ_LAST(&bv->bv_hd, buflists);
2015 KASSERT(bp == NULL || bp->b_bufobj == bo,
2016 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2017 #endif
2018 BO_UNLOCK(bo);
2019 }
2020
2021 /*
2022 * Increment the use and hold counts on the vnode, taking care to reference
2023 * the driver's usecount if this is a chardev. The vholdl() will remove
2024 * the vnode from the free list if it is presently free. Requires the
2025 * vnode interlock and returns with it held.
2026 */
2027 static void
2028 v_incr_usecount(struct vnode *vp)
2029 {
2030
2031 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2032 vp->v_usecount++;
2033 if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2034 dev_lock();
2035 vp->v_rdev->si_usecount++;
2036 dev_unlock();
2037 }
2038 vholdl(vp);
2039 }
2040
2041 /*
2042 * Turn a holdcnt into a use+holdcnt such that only one call to
2043 * v_decr_usecount is needed.
2044 */
2045 static void
2046 v_upgrade_usecount(struct vnode *vp)
2047 {
2048
2049 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2050 vp->v_usecount++;
2051 if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2052 dev_lock();
2053 vp->v_rdev->si_usecount++;
2054 dev_unlock();
2055 }
2056 }
2057
2058 /*
2059 * Decrement the vnode use and hold count along with the driver's usecount
2060 * if this is a chardev. The vdropl() below releases the vnode interlock
2061 * as it may free the vnode.
2062 */
2063 static void
2064 v_decr_usecount(struct vnode *vp)
2065 {
2066
2067 ASSERT_VI_LOCKED(vp, __FUNCTION__);
2068 VNASSERT(vp->v_usecount > 0, vp,
2069 ("v_decr_usecount: negative usecount"));
2070 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2071 vp->v_usecount--;
2072 if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2073 dev_lock();
2074 vp->v_rdev->si_usecount--;
2075 dev_unlock();
2076 }
2077 vdropl(vp);
2078 }
2079
2080 /*
2081 * Decrement only the use count and driver use count. This is intended to
2082 * be paired with a follow on vdropl() to release the remaining hold count.
2083 * In this way we may vgone() a vnode with a 0 usecount without risk of
2084 * having it end up on a free list because the hold count is kept above 0.
2085 */
2086 static void
2087 v_decr_useonly(struct vnode *vp)
2088 {
2089
2090 ASSERT_VI_LOCKED(vp, __FUNCTION__);
2091 VNASSERT(vp->v_usecount > 0, vp,
2092 ("v_decr_useonly: negative usecount"));
2093 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2094 vp->v_usecount--;
2095 if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2096 dev_lock();
2097 vp->v_rdev->si_usecount--;
2098 dev_unlock();
2099 }
2100 }
2101
2102 /*
2103 * Grab a particular vnode from the free list, increment its
2104 * reference count and lock it. VI_DOOMED is set if the vnode
2105 * is being destroyed. Only callers who specify LK_RETRY will
2106 * see doomed vnodes. If inactive processing was delayed in
2107 * vput try to do it here.
2108 */
2109 int
2110 vget(struct vnode *vp, int flags, struct thread *td)
2111 {
2112 int error;
2113
2114 error = 0;
2115 VFS_ASSERT_GIANT(vp->v_mount);
2116 VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2117 ("vget: invalid lock operation"));
2118 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2119
2120 if ((flags & LK_INTERLOCK) == 0)
2121 VI_LOCK(vp);
2122 vholdl(vp);
2123 if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
2124 vdrop(vp);
2125 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2126 vp);
2127 return (error);
2128 }
2129 if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2130 panic("vget: vn_lock failed to return ENOENT\n");
2131 VI_LOCK(vp);
2132 /* Upgrade our holdcnt to a usecount. */
2133 v_upgrade_usecount(vp);
2134 /*
2135 * We don't guarantee that any particular close will
2136 * trigger inactive processing so just make a best effort
2137 * here at preventing a reference to a removed file. If
2138 * we don't succeed no harm is done.
2139 */
2140 if (vp->v_iflag & VI_OWEINACT) {
2141 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2142 (flags & LK_NOWAIT) == 0)
2143 vinactive(vp, td);
2144 vp->v_iflag &= ~VI_OWEINACT;
2145 }
2146 VI_UNLOCK(vp);
2147 return (0);
2148 }
2149
2150 /*
2151 * Increase the reference count of a vnode.
2152 */
2153 void
2154 vref(struct vnode *vp)
2155 {
2156
2157 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2158 VI_LOCK(vp);
2159 v_incr_usecount(vp);
2160 VI_UNLOCK(vp);
2161 }
2162
2163 /*
2164 * Return reference count of a vnode.
2165 *
2166 * The results of this call are only guaranteed when some mechanism other
2167 * than the VI lock is used to stop other processes from gaining references
2168 * to the vnode. This may be the case if the caller holds the only reference.
2169 * This is also useful when stale data is acceptable as race conditions may
2170 * be accounted for by some other means.
2171 */
2172 int
2173 vrefcnt(struct vnode *vp)
2174 {
2175 int usecnt;
2176
2177 VI_LOCK(vp);
2178 usecnt = vp->v_usecount;
2179 VI_UNLOCK(vp);
2180
2181 return (usecnt);
2182 }
2183
2184 #define VPUTX_VRELE 1
2185 #define VPUTX_VPUT 2
2186 #define VPUTX_VUNREF 3
2187
2188 static void
2189 vputx(struct vnode *vp, int func)
2190 {
2191 int error;
2192
2193 KASSERT(vp != NULL, ("vputx: null vp"));
2194 if (func == VPUTX_VUNREF)
2195 ASSERT_VOP_LOCKED(vp, "vunref");
2196 else if (func == VPUTX_VPUT)
2197 ASSERT_VOP_LOCKED(vp, "vput");
2198 else
2199 KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2200 VFS_ASSERT_GIANT(vp->v_mount);
2201 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2202 VI_LOCK(vp);
2203
2204 /* Skip this v_writecount check if we're going to panic below. */
2205 VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2206 ("vputx: missed vn_close"));
2207 error = 0;
2208
2209 if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2210 vp->v_usecount == 1)) {
2211 if (func == VPUTX_VPUT)
2212 VOP_UNLOCK(vp, 0);
2213 v_decr_usecount(vp);
2214 return;
2215 }
2216
2217 if (vp->v_usecount != 1) {
2218 vprint("vputx: negative ref count", vp);
2219 panic("vputx: negative ref cnt");
2220 }
2221 CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2222 /*
2223 * We want to hold the vnode until the inactive finishes to
2224 * prevent vgone() races. We drop the use count here and the
2225 * hold count below when we're done.
2226 */
2227 v_decr_useonly(vp);
2228 /*
2229 * We must call VOP_INACTIVE with the node locked. Mark
2230 * as VI_DOINGINACT to avoid recursion.
2231 */
2232 vp->v_iflag |= VI_OWEINACT;
2233 switch (func) {
2234 case VPUTX_VRELE:
2235 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2236 VI_LOCK(vp);
2237 break;
2238 case VPUTX_VPUT:
2239 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2240 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2241 LK_NOWAIT);
2242 VI_LOCK(vp);
2243 }
2244 break;
2245 case VPUTX_VUNREF:
2246 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
2247 error = EBUSY;
2248 break;
2249 }
2250 if (vp->v_usecount > 0)
2251 vp->v_iflag &= ~VI_OWEINACT;
2252 if (error == 0) {
2253 if (vp->v_iflag & VI_OWEINACT)
2254 vinactive(vp, curthread);
2255 if (func != VPUTX_VUNREF)
2256 VOP_UNLOCK(vp, 0);
2257 }
2258 vdropl(vp);
2259 }
2260
2261 /*
2262 * Vnode put/release.
2263 * If count drops to zero, call inactive routine and return to freelist.
2264 */
2265 void
2266 vrele(struct vnode *vp)
2267 {
2268
2269 vputx(vp, VPUTX_VRELE);
2270 }
2271
2272 /*
2273 * Release an already locked vnode. This give the same effects as
2274 * unlock+vrele(), but takes less time and avoids releasing and
2275 * re-aquiring the lock (as vrele() acquires the lock internally.)
2276 */
2277 void
2278 vput(struct vnode *vp)
2279 {
2280
2281 vputx(vp, VPUTX_VPUT);
2282 }
2283
2284 /*
2285 * Release an exclusively locked vnode. Do not unlock the vnode lock.
2286 */
2287 void
2288 vunref(struct vnode *vp)
2289 {
2290
2291 vputx(vp, VPUTX_VUNREF);
2292 }
2293
/*
 * Take a hold reference so that the vnode is not recycled.  This is the
 * self-locking wrapper around vholdl(): the interlock is acquired and
 * released here.
 */
void
vhold(struct vnode *vp)
{

	VI_LOCK(vp);
	vholdl(vp);
	VI_UNLOCK(vp);
}
2305
2306 /*
2307 * Increase the hold count and activate if this is the first reference.
2308 */
2309 void
2310 vholdl(struct vnode *vp)
2311 {
2312
2313 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2314 vp->v_holdcnt++;
2315 if (!VSHOULDBUSY(vp))
2316 return;
2317 ASSERT_VI_LOCKED(vp, "vholdl");
2318 VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
2319 VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
2320 /*
2321 * Remove a vnode from the free list and mark it as in use.
2322 */
2323 mtx_lock(&vnode_free_list_mtx);
2324 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2325 freevnodes--;
2326 vp->v_iflag &= ~(VI_FREE|VI_AGE);
2327 mtx_unlock(&vnode_free_list_mtx);
2328 }
2329
/*
 * Release a hold reference; the counterpart of vhold().  The interlock
 * is taken here and handed to vdropl(), which releases it.
 */
void
vdrop(struct vnode *vp)
{

	VI_LOCK(vp);
	vdropl(vp);
}
2341
2342 /*
2343 * Drop the hold count of the vnode. If this is the last reference to
2344 * the vnode we place it on the free list unless it has been vgone'd
2345 * (marked VI_DOOMED) in which case we will free it.
2346 */
2347 void
2348 vdropl(struct vnode *vp)
2349 {
2350 struct bufobj *bo;
2351
2352 ASSERT_VI_LOCKED(vp, "vdropl");
2353 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2354 if (vp->v_holdcnt <= 0)
2355 panic("vdrop: holdcnt %d", vp->v_holdcnt);
2356 vp->v_holdcnt--;
2357 if (vp->v_holdcnt > 0) {
2358 VI_UNLOCK(vp);
2359 return;
2360 }
2361 if ((vp->v_iflag & VI_DOOMED) == 0) {
2362 /*
2363 * Mark a vnode as free, putting it up for recycling.
2364 */
2365 mtx_lock(&vnode_free_list_mtx);
2366 VNASSERT(vp->v_op != NULL, vp,
2367 ("vdropl: vnode already reclaimed."));
2368 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2369 ("vnode already free"));
2370 VNASSERT(VSHOULDFREE(vp), vp,
2371 ("vdropl: freeing when we shouldn't"));
2372 VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp,
2373 ("vdropl: Freeing doomed vnode"));
2374 if (vp->v_iflag & VI_AGE) {
2375 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2376 } else {
2377 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2378 }
2379 freevnodes++;
2380 vp->v_iflag &= ~VI_AGE;
2381 vp->v_iflag |= VI_FREE;
2382 mtx_unlock(&vnode_free_list_mtx);
2383 VI_UNLOCK(vp);
2384 return;
2385 }
2386 /*
2387 * The vnode has been marked for destruction, so free it.
2388 */
2389 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2390 mtx_lock(&vnode_free_list_mtx);
2391 numvnodes--;
2392 mtx_unlock(&vnode_free_list_mtx);
2393 bo = &vp->v_bufobj;
2394 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2395 ("cleaned vnode still on the free list."));
2396 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2397 VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
2398 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2399 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2400 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2401 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2402 VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
2403 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2404 VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
2405 VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
2406 VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
2407 VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
2408 VI_UNLOCK(vp);
2409 #ifdef MAC
2410 mac_vnode_destroy(vp);
2411 #endif
2412 if (vp->v_pollinfo != NULL)
2413 destroy_vpollinfo(vp->v_pollinfo);
2414 #ifdef INVARIANTS
2415 /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
2416 vp->v_op = NULL;
2417 #endif
2418 lockdestroy(vp->v_vnlock);
2419 mtx_destroy(&vp->v_interlock);
2420 mtx_destroy(BO_MTX(bo));
2421 uma_zfree(vnode_zone, vp);
2422 }
2423
2424 /*
2425 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2426 * flags. DOINGINACT prevents us from recursing in calls to vinactive.
2427 * OWEINACT tracks whether a vnode missed a call to inactive due to a
2428 * failed lock upgrade.
2429 */
2430 void
2431 vinactive(struct vnode *vp, struct thread *td)
2432 {
2433
2434 ASSERT_VOP_ELOCKED(vp, "vinactive");
2435 ASSERT_VI_LOCKED(vp, "vinactive");
2436 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2437 ("vinactive: recursed on VI_DOINGINACT"));
2438 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2439 vp->v_iflag |= VI_DOINGINACT;
2440 vp->v_iflag &= ~VI_OWEINACT;
2441 VI_UNLOCK(vp);
2442 VOP_INACTIVE(vp, td);
2443 VI_LOCK(vp);
2444 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2445 ("vinactive: lost VI_DOINGINACT"));
2446 vp->v_iflag &= ~VI_DOINGINACT;
2447 }
2448
2449 /*
2450 * Remove any vnodes in the vnode table belonging to mount point mp.
2451 *
2452 * If FORCECLOSE is not specified, there should not be any active ones,
2453 * return error if any are found (nb: this is a user error, not a
2454 * system error). If FORCECLOSE is specified, detach any active vnodes
2455 * that are found.
2456 *
2457 * If WRITECLOSE is set, only flush out regular file vnodes open for
2458 * writing.
2459 *
2460 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2461 *
2462 * `rootrefs' specifies the base reference count for the root vnode
2463 * of this filesystem. The root vnode is considered busy if its
2464 * v_usecount exceeds this value. On a successful return, vflush(, td)
2465 * will call vrele() on the root vnode exactly rootrefs times.
2466 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2467 * be zero.
2468 */
2469 #ifdef DIAGNOSTIC
2470 static int busyprt = 0; /* print out busy vnodes */
2471 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2472 #endif
2473
2474 int
2475 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2476 {
2477 struct vnode *vp, *mvp, *rootvp = NULL;
2478 struct vattr vattr;
2479 int busy = 0, error;
2480
2481 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2482 rootrefs, flags);
2483 if (rootrefs > 0) {
2484 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2485 ("vflush: bad args"));
2486 /*
2487 * Get the filesystem root vnode. We can vput() it
2488 * immediately, since with rootrefs > 0, it won't go away.
2489 */
2490 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2491 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2492 __func__, error);
2493 return (error);
2494 }
2495 vput(rootvp);
2496 }
2497 loop:
2498 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2499 vholdl(vp);
2500 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2501 if (error) {
2502 vdrop(vp);
2503 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2504 goto loop;
2505 }
2506 /*
2507 * Skip over a vnodes marked VV_SYSTEM.
2508 */
2509 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2510 VOP_UNLOCK(vp, 0);
2511 vdrop(vp);
2512 continue;
2513 }
2514 /*
2515 * If WRITECLOSE is set, flush out unlinked but still open
2516 * files (even if open only for reading) and regular file
2517 * vnodes open for writing.
2518 */
2519 if (flags & WRITECLOSE) {
2520 if (vp->v_object != NULL) {
2521 VM_OBJECT_LOCK(vp->v_object);
2522 vm_object_page_clean(vp->v_object, 0, 0, 0);
2523 VM_OBJECT_UNLOCK(vp->v_object);
2524 }
2525 error = VOP_FSYNC(vp, MNT_WAIT, td);
2526 if (error != 0) {
2527 VOP_UNLOCK(vp, 0);
2528 vdrop(vp);
2529 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2530 return (error);
2531 }
2532 error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2533 VI_LOCK(vp);
2534
2535 if ((vp->v_type == VNON ||
2536 (error == 0 && vattr.va_nlink > 0)) &&
2537 (vp->v_writecount == 0 || vp->v_type != VREG)) {
2538 VOP_UNLOCK(vp, 0);
2539 vdropl(vp);
2540 continue;
2541 }
2542 } else
2543 VI_LOCK(vp);
2544 /*
2545 * With v_usecount == 0, all we need to do is clear out the
2546 * vnode data structures and we are done.
2547 *
2548 * If FORCECLOSE is set, forcibly close the vnode.
2549 */
2550 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2551 VNASSERT(vp->v_usecount == 0 ||
2552 (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2553 ("device VNODE %p is FORCECLOSED", vp));
2554 vgonel(vp);
2555 } else {
2556 busy++;
2557 #ifdef DIAGNOSTIC
2558 if (busyprt)
2559 vprint("vflush: busy vnode", vp);
2560 #endif
2561 }
2562 VOP_UNLOCK(vp, 0);
2563 vdropl(vp);
2564 }
2565 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2566 /*
2567 * If just the root vnode is busy, and if its refcount
2568 * is equal to `rootrefs', then go ahead and kill it.
2569 */
2570 VI_LOCK(rootvp);
2571 KASSERT(busy > 0, ("vflush: not busy"));
2572 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2573 ("vflush: usecount %d < rootrefs %d",
2574 rootvp->v_usecount, rootrefs));
2575 if (busy == 1 && rootvp->v_usecount == rootrefs) {
2576 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2577 vgone(rootvp);
2578 VOP_UNLOCK(rootvp, 0);
2579 busy = 0;
2580 } else
2581 VI_UNLOCK(rootvp);
2582 }
2583 if (busy) {
2584 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2585 busy);
2586 return (EBUSY);
2587 }
2588 for (; rootrefs > 0; rootrefs--)
2589 vrele(rootvp);
2590 return (0);
2591 }
2592
2593 /*
2594 * Recycle an unused vnode to the front of the free list.
2595 */
2596 int
2597 vrecycle(struct vnode *vp, struct thread *td)
2598 {
2599 int recycled;
2600
2601 ASSERT_VOP_ELOCKED(vp, "vrecycle");
2602 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2603 recycled = 0;
2604 VI_LOCK(vp);
2605 if (vp->v_usecount == 0) {
2606 recycled = 1;
2607 vgonel(vp);
2608 }
2609 VI_UNLOCK(vp);
2610 return (recycled);
2611 }
2612
/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Self-locking wrapper around vgonel(): the interlock is taken
 * before the call and released afterwards.
 */
void
vgone(struct vnode *vp)
{

	VI_LOCK(vp);
	vgonel(vp);
	VI_UNLOCK(vp);
}
2624
2625 /*
2626 * vgone, with the vp interlock held.
2627 */
2628 void
2629 vgonel(struct vnode *vp)
2630 {
2631 struct thread *td;
2632 int oweinact;
2633 int active;
2634 struct mount *mp;
2635
2636 ASSERT_VOP_ELOCKED(vp, "vgonel");
2637 ASSERT_VI_LOCKED(vp, "vgonel");
2638 VNASSERT(vp->v_holdcnt, vp,
2639 ("vgonel: vp %p has no reference.", vp));
2640 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2641 td = curthread;
2642
2643 /*
2644 * Don't vgonel if we're already doomed.
2645 */
2646 if (vp->v_iflag & VI_DOOMED)
2647 return;
2648 vp->v_iflag |= VI_DOOMED;
2649 /*
2650 * Check to see if the vnode is in use. If so, we have to call
2651 * VOP_CLOSE() and VOP_INACTIVE().
2652 */
2653 active = vp->v_usecount;
2654 oweinact = (vp->v_iflag & VI_OWEINACT);
2655 VI_UNLOCK(vp);
2656 /*
2657 * Clean out any buffers associated with the vnode.
2658 * If the flush fails, just toss the buffers.
2659 */
2660 mp = NULL;
2661 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2662 (void) vn_start_secondary_write(vp, &mp, V_WAIT);
2663 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0)
2664 vinvalbuf(vp, 0, 0, 0);
2665
2666 /*
2667 * If purging an active vnode, it must be closed and
2668 * deactivated before being reclaimed.
2669 */
2670 if (active)
2671 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2672 if (oweinact || active) {
2673 VI_LOCK(vp);
2674 if ((vp->v_iflag & VI_DOINGINACT) == 0)
2675 vinactive(vp, td);
2676 VI_UNLOCK(vp);
2677 }
2678 if (vp->v_type == VSOCK)
2679 vfs_unp_reclaim(vp);
2680 /*
2681 * Reclaim the vnode.
2682 */
2683 if (VOP_RECLAIM(vp, td))
2684 panic("vgone: cannot reclaim");
2685 if (mp != NULL)
2686 vn_finished_secondary_write(mp);
2687 VNASSERT(vp->v_object == NULL, vp,
2688 ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2689 /*
2690 * Clear the advisory locks and wake up waiting threads.
2691 */
2692 (void)VOP_ADVLOCKPURGE(vp);
2693 /*
2694 * Delete from old mount point vnode list.
2695 */
2696 delmntque(vp);
2697 cache_purge(vp);
2698 /*
2699 * Done with purge, reset to the standard lock and invalidate
2700 * the vnode.
2701 */
2702 VI_LOCK(vp);
2703 vp->v_vnlock = &vp->v_lock;
2704 vp->v_op = &dead_vnodeops;
2705 vp->v_tag = "none";
2706 vp->v_type = VBAD;
2707 }
2708
2709 /*
2710 * Calculate the total number of references to a special device.
2711 */
2712 int
2713 vcount(struct vnode *vp)
2714 {
2715 int count;
2716
2717 dev_lock();
2718 count = vp->v_rdev->si_usecount;
2719 dev_unlock();
2720 return (count);
2721 }
2722
2723 /*
2724 * Same as above, but using the struct cdev *as argument
2725 */
2726 int
2727 count_dev(struct cdev *dev)
2728 {
2729 int count;
2730
2731 dev_lock();
2732 count = dev->si_usecount;
2733 dev_unlock();
2734 return(count);
2735 }
2736
2737 /*
2738 * Print out a description of a vnode.
2739 */
2740 static char *typename[] =
2741 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2742 "VMARKER"};
2743
2744 void
2745 vn_printf(struct vnode *vp, const char *fmt, ...)
2746 {
2747 va_list ap;
2748 char buf[256], buf2[16];
2749 u_long flags;
2750
2751 va_start(ap, fmt);
2752 vprintf(fmt, ap);
2753 va_end(ap);
2754 printf("%p: ", (void *)vp);
2755 printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2756 printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n",
2757 vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2758 buf[0] = '\0';
2759 buf[1] = '\0';
2760 if (vp->v_vflag & VV_ROOT)
2761 strlcat(buf, "|VV_ROOT", sizeof(buf));
2762 if (vp->v_vflag & VV_ISTTY)
2763 strlcat(buf, "|VV_ISTTY", sizeof(buf));
2764 if (vp->v_vflag & VV_NOSYNC)
2765 strlcat(buf, "|VV_NOSYNC", sizeof(buf));
2766 if (vp->v_vflag & VV_CACHEDLABEL)
2767 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
2768 if (vp->v_vflag & VV_TEXT)
2769 strlcat(buf, "|VV_TEXT", sizeof(buf));
2770 if (vp->v_vflag & VV_COPYONWRITE)
2771 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
2772 if (vp->v_vflag & VV_SYSTEM)
2773 strlcat(buf, "|VV_SYSTEM", sizeof(buf));
2774 if (vp->v_vflag & VV_PROCDEP)
2775 strlcat(buf, "|VV_PROCDEP", sizeof(buf));
2776 if (vp->v_vflag & VV_NOKNOTE)
2777 strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
2778 if (vp->v_vflag & VV_DELETED)
2779 strlcat(buf, "|VV_DELETED", sizeof(buf));
2780 if (vp->v_vflag & VV_MD)
2781 strlcat(buf, "|VV_MD", sizeof(buf));
2782 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC |
2783 VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
2784 VV_NOKNOTE | VV_DELETED | VV_MD);
2785 if (flags != 0) {
2786 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
2787 strlcat(buf, buf2, sizeof(buf));
2788 }
2789 if (vp->v_iflag & VI_MOUNT)
2790 strlcat(buf, "|VI_MOUNT", sizeof(buf));
2791 if (vp->v_iflag & VI_AGE)
2792 strlcat(buf, "|VI_AGE", sizeof(buf));
2793 if (vp->v_iflag & VI_DOOMED)
2794 strlcat(buf, "|VI_DOOMED", sizeof(buf));
2795 if (vp->v_iflag & VI_FREE)
2796 strlcat(buf, "|VI_FREE", sizeof(buf));
2797 if (vp->v_iflag & VI_DOINGINACT)
2798 strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
2799 if (vp->v_iflag & VI_OWEINACT)
2800 strlcat(buf, "|VI_OWEINACT", sizeof(buf));
2801 flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
2802 VI_DOINGINACT | VI_OWEINACT);
2803 if (flags != 0) {
2804 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
2805 strlcat(buf, buf2, sizeof(buf));
2806 }
2807 printf(" flags (%s)\n", buf + 1);
2808 if (mtx_owned(VI_MTX(vp)))
2809 printf(" VI_LOCKed");
2810 if (vp->v_object != NULL)
2811 printf(" v_object %p ref %d pages %d\n",
2812 vp->v_object, vp->v_object->ref_count,
2813 vp->v_object->resident_page_count);
2814 printf(" ");
2815 lockmgr_printinfo(vp->v_vnlock);
2816 if (vp->v_data != NULL)
2817 VOP_PRINT(vp);
2818 }
2819
2820 #ifdef DDB
2821 /*
2822 * List all of the locked vnodes in the system.
2823 * Called when debugging the kernel.
2824 */
2825 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2826 {
2827 struct mount *mp, *nmp;
2828 struct vnode *vp;
2829
2830 /*
2831 * Note: because this is DDB, we can't obey the locking semantics
2832 * for these structures, which means we could catch an inconsistent
2833 * state and dereference a nasty pointer. Not much to be done
2834 * about that.
2835 */
2836 db_printf("Locked vnodes\n");
2837 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2838 nmp = TAILQ_NEXT(mp, mnt_list);
2839 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2840 if (vp->v_type != VMARKER &&
2841 VOP_ISLOCKED(vp))
2842 vprint("", vp);
2843 }
2844 nmp = TAILQ_NEXT(mp, mnt_list);
2845 }
2846 }
2847
2848 /*
2849 * Show details about the given vnode.
2850 */
2851 DB_SHOW_COMMAND(vnode, db_show_vnode)
2852 {
2853 struct vnode *vp;
2854
2855 if (!have_addr)
2856 return;
2857 vp = (struct vnode *)addr;
2858 vn_printf(vp, "vnode ");
2859 }
2860
2861 /*
2862 * Show details about the given mount point.
2863 */
2864 DB_SHOW_COMMAND(mount, db_show_mount)
2865 {
2866 struct mount *mp;
2867 struct vfsopt *opt;
2868 struct statfs *sp;
2869 struct vnode *vp;
2870 char buf[512];
2871 uint64_t mflags;
2872 u_int flags;
2873
2874 if (!have_addr) {
2875 /* No address given, print short info about all mount points. */
2876 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2877 db_printf("%p %s on %s (%s)\n", mp,
2878 mp->mnt_stat.f_mntfromname,
2879 mp->mnt_stat.f_mntonname,
2880 mp->mnt_stat.f_fstypename);
2881 if (db_pager_quit)
2882 break;
2883 }
2884 db_printf("\nMore info: show mount <addr>\n");
2885 return;
2886 }
2887
2888 mp = (struct mount *)addr;
2889 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
2890 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
2891
2892 buf[0] = '\0';
2893 mflags = mp->mnt_flag;
2894 #define MNT_FLAG(flag) do { \
2895 if (mflags & (flag)) { \
2896 if (buf[0] != '\0') \
2897 strlcat(buf, ", ", sizeof(buf)); \
2898 strlcat(buf, (#flag) + 4, sizeof(buf)); \
2899 mflags &= ~(flag); \
2900 } \
2901 } while (0)
2902 MNT_FLAG(MNT_RDONLY);
2903 MNT_FLAG(MNT_SYNCHRONOUS);
2904 MNT_FLAG(MNT_NOEXEC);
2905 MNT_FLAG(MNT_NOSUID);
2906 MNT_FLAG(MNT_UNION);
2907 MNT_FLAG(MNT_ASYNC);
2908 MNT_FLAG(MNT_SUIDDIR);
2909 MNT_FLAG(MNT_SOFTDEP);
2910 MNT_FLAG(MNT_SUJ);
2911 MNT_FLAG(MNT_NOSYMFOLLOW);
2912 MNT_FLAG(MNT_GJOURNAL);
2913 MNT_FLAG(MNT_MULTILABEL);
2914 MNT_FLAG(MNT_ACLS);
2915 MNT_FLAG(MNT_NOATIME);
2916 MNT_FLAG(MNT_NOCLUSTERR);
2917 MNT_FLAG(MNT_NOCLUSTERW);
2918 MNT_FLAG(MNT_NFS4ACLS);
2919 MNT_FLAG(MNT_EXRDONLY);
2920 MNT_FLAG(MNT_EXPORTED);
2921 MNT_FLAG(MNT_DEFEXPORTED);
2922 MNT_FLAG(MNT_EXPORTANON);
2923 MNT_FLAG(MNT_EXKERB);
2924 MNT_FLAG(MNT_EXPUBLIC);
2925 MNT_FLAG(MNT_LOCAL);
2926 MNT_FLAG(MNT_QUOTA);
2927 MNT_FLAG(MNT_ROOTFS);
2928 MNT_FLAG(MNT_USER);
2929 MNT_FLAG(MNT_IGNORE);
2930 MNT_FLAG(MNT_UPDATE);
2931 MNT_FLAG(MNT_DELEXPORT);
2932 MNT_FLAG(MNT_RELOAD);
2933 MNT_FLAG(MNT_FORCE);
2934 MNT_FLAG(MNT_SNAPSHOT);
2935 MNT_FLAG(MNT_BYFSID);
2936 #undef MNT_FLAG
2937 if (mflags != 0) {
2938 if (buf[0] != '\0')
2939 strlcat(buf, ", ", sizeof(buf));
2940 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
2941 "0x%016jx", mflags);
2942 }
2943 db_printf(" mnt_flag = %s\n", buf);
2944
2945 buf[0] = '\0';
2946 flags = mp->mnt_kern_flag;
2947 #define MNT_KERN_FLAG(flag) do { \
2948 if (flags & (flag)) { \
2949 if (buf[0] != '\0') \
2950 strlcat(buf, ", ", sizeof(buf)); \
2951 strlcat(buf, (#flag) + 5, sizeof(buf)); \
2952 flags &= ~(flag); \
2953 } \
2954 } while (0)
2955 MNT_KERN_FLAG(MNTK_UNMOUNTF);
2956 MNT_KERN_FLAG(MNTK_ASYNC);
2957 MNT_KERN_FLAG(MNTK_SOFTDEP);
2958 MNT_KERN_FLAG(MNTK_NOINSMNTQ);
2959 MNT_KERN_FLAG(MNTK_DRAINING);
2960 MNT_KERN_FLAG(MNTK_REFEXPIRE);
2961 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
2962 MNT_KERN_FLAG(MNTK_SHARED_WRITES);
2963 MNT_KERN_FLAG(MNTK_NOASYNC);
2964 MNT_KERN_FLAG(MNTK_UNMOUNT);
2965 MNT_KERN_FLAG(MNTK_MWAIT);
2966 MNT_KERN_FLAG(MNTK_SUSPEND);
2967 MNT_KERN_FLAG(MNTK_SUSPEND2);
2968 MNT_KERN_FLAG(MNTK_SUSPENDED);
2969 MNT_KERN_FLAG(MNTK_MPSAFE);
2970 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
2971 MNT_KERN_FLAG(MNTK_NOKNOTE);
2972 #undef MNT_KERN_FLAG
2973 if (flags != 0) {
2974 if (buf[0] != '\0')
2975 strlcat(buf, ", ", sizeof(buf));
2976 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
2977 "0x%08x", flags);
2978 }
2979 db_printf(" mnt_kern_flag = %s\n", buf);
2980
2981 db_printf(" mnt_opt = ");
2982 opt = TAILQ_FIRST(mp->mnt_opt);
2983 if (opt != NULL) {
2984 db_printf("%s", opt->name);
2985 opt = TAILQ_NEXT(opt, link);
2986 while (opt != NULL) {
2987 db_printf(", %s", opt->name);
2988 opt = TAILQ_NEXT(opt, link);
2989 }
2990 }
2991 db_printf("\n");
2992
2993 sp = &mp->mnt_stat;
2994 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx "
2995 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
2996 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
2997 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
2998 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
2999 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
3000 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
3001 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
3002 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
3003 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
3004 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
3005 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
3006
3007 db_printf(" mnt_cred = { uid=%u ruid=%u",
3008 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
3009 if (jailed(mp->mnt_cred))
3010 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
3011 db_printf(" }\n");
3012 db_printf(" mnt_ref = %d\n", mp->mnt_ref);
3013 db_printf(" mnt_gen = %d\n", mp->mnt_gen);
3014 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
3015 db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount);
3016 db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
3017 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
3018 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
3019 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
3020 db_printf(" mnt_secondary_accwrites = %d\n",
3021 mp->mnt_secondary_accwrites);
3022 db_printf(" mnt_gjprovider = %s\n",
3023 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
3024 db_printf("\n");
3025
3026 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3027 if (vp->v_type != VMARKER) {
3028 vn_printf(vp, "vnode ");
3029 if (db_pager_quit)
3030 break;
3031 }
3032 }
3033 }
3034 #endif /* DDB */
3035
3036 /*
3037 * Fill in a struct xvfsconf based on a struct vfsconf.
3038 */
3039 static void
3040 vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
3041 {
3042
3043 strcpy(xvfsp->vfc_name, vfsp->vfc_name);
3044 xvfsp->vfc_typenum = vfsp->vfc_typenum;
3045 xvfsp->vfc_refcount = vfsp->vfc_refcount;
3046 xvfsp->vfc_flags = vfsp->vfc_flags;
3047 /*
3048 * These are unused in userland, we keep them
3049 * to not break binary compatibility.
3050 */
3051 xvfsp->vfc_vfsops = NULL;
3052 xvfsp->vfc_next = NULL;
3053 }
3054
3055 /*
3056 * Top level filesystem related information gathering.
3057 */
3058 static int
3059 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
3060 {
3061 struct vfsconf *vfsp;
3062 struct xvfsconf xvfsp;
3063 int error;
3064
3065 error = 0;
3066 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3067 bzero(&xvfsp, sizeof(xvfsp));
3068 vfsconf2x(vfsp, &xvfsp);
3069 error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
3070 if (error)
3071 break;
3072 }
3073 return (error);
3074 }
3075
3076 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD,
3077 NULL, 0, sysctl_vfs_conflist,
3078 "S,xvfsconf", "List of all configured filesystems");
3079
3080 #ifndef BURN_BRIDGES
3081 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3082
3083 static int
3084 vfs_sysctl(SYSCTL_HANDLER_ARGS)
3085 {
3086 int *name = (int *)arg1 - 1; /* XXX */
3087 u_int namelen = arg2 + 1; /* XXX */
3088 struct vfsconf *vfsp;
3089 struct xvfsconf xvfsp;
3090
3091 log(LOG_WARNING, "userland calling deprecated sysctl, "
3092 "please rebuild world\n");
3093
3094 #if 1 || defined(COMPAT_PRELITE2)
3095 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3096 if (namelen == 1)
3097 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3098 #endif
3099
3100 switch (name[1]) {
3101 case VFS_MAXTYPENUM:
3102 if (namelen != 2)
3103 return (ENOTDIR);
3104 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3105 case VFS_CONF:
3106 if (namelen != 3)
3107 return (ENOTDIR); /* overloaded */
3108 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
3109 if (vfsp->vfc_typenum == name[2])
3110 break;
3111 if (vfsp == NULL)
3112 return (EOPNOTSUPP);
3113 bzero(&xvfsp, sizeof(xvfsp));
3114 vfsconf2x(vfsp, &xvfsp);
3115 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3116 }
3117 return (EOPNOTSUPP);
3118 }
3119
3120 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
3121 vfs_sysctl, "Generic filesystem");
3122
3123 #if 1 || defined(COMPAT_PRELITE2)
3124
3125 static int
3126 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3127 {
3128 int error;
3129 struct vfsconf *vfsp;
3130 struct ovfsconf ovfs;
3131
3132 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3133 bzero(&ovfs, sizeof(ovfs));
3134 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
3135 strcpy(ovfs.vfc_name, vfsp->vfc_name);
3136 ovfs.vfc_index = vfsp->vfc_typenum;
3137 ovfs.vfc_refcount = vfsp->vfc_refcount;
3138 ovfs.vfc_flags = vfsp->vfc_flags;
3139 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3140 if (error)
3141 return error;
3142 }
3143 return 0;
3144 }
3145
3146 #endif /* 1 || COMPAT_PRELITE2 */
3147 #endif /* !BURN_BRIDGES */
3148
3149 #define KINFO_VNODESLOP 10
3150 #ifdef notyet
3151 /*
3152 * Dump vnode list (via sysctl).
3153 */
3154 /* ARGSUSED */
3155 static int
3156 sysctl_vnode(SYSCTL_HANDLER_ARGS)
3157 {
3158 struct xvnode *xvn;
3159 struct mount *mp;
3160 struct vnode *vp;
3161 int error, len, n;
3162
3163 /*
3164 * Stale numvnodes access is not fatal here.
3165 */
3166 req->lock = 0;
3167 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3168 if (!req->oldptr)
3169 /* Make an estimate */
3170 return (SYSCTL_OUT(req, 0, len));
3171
3172 error = sysctl_wire_old_buffer(req, 0);
3173 if (error != 0)
3174 return (error);
3175 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3176 n = 0;
3177 mtx_lock(&mountlist_mtx);
3178 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3179 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3180 continue;
3181 MNT_ILOCK(mp);
3182 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3183 if (n == len)
3184 break;
3185 vref(vp);
3186 xvn[n].xv_size = sizeof *xvn;
3187 xvn[n].xv_vnode = vp;
3188 xvn[n].xv_id = 0; /* XXX compat */
3189 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3190 XV_COPY(usecount);
3191 XV_COPY(writecount);
3192 XV_COPY(holdcnt);
3193 XV_COPY(mount);
3194 XV_COPY(numoutput);
3195 XV_COPY(type);
3196 #undef XV_COPY
3197 xvn[n].xv_flag = vp->v_vflag;
3198
3199 switch (vp->v_type) {
3200 case VREG:
3201 case VDIR:
3202 case VLNK:
3203 break;
3204 case VBLK:
3205 case VCHR:
3206 if (vp->v_rdev == NULL) {
3207 vrele(vp);
3208 continue;
3209 }
3210 xvn[n].xv_dev = dev2udev(vp->v_rdev);
3211 break;
3212 case VSOCK:
3213 xvn[n].xv_socket = vp->v_socket;
3214 break;
3215 case VFIFO:
3216 xvn[n].xv_fifo = vp->v_fifoinfo;
3217 break;
3218 case VNON:
3219 case VBAD:
3220 default:
3221 /* shouldn't happen? */
3222 vrele(vp);
3223 continue;
3224 }
3225 vrele(vp);
3226 ++n;
3227 }
3228 MNT_IUNLOCK(mp);
3229 mtx_lock(&mountlist_mtx);
3230 vfs_unbusy(mp);
3231 if (n == len)
3232 break;
3233 }
3234 mtx_unlock(&mountlist_mtx);
3235
3236 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3237 free(xvn, M_TEMP);
3238 return (error);
3239 }
3240
3241 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
3242 0, 0, sysctl_vnode, "S,xvnode", "");
3243 #endif
3244
3245 /*
3246 * Unmount all filesystems. The list is traversed in reverse order
3247 * of mounting to avoid dependencies.
3248 */
3249 void
3250 vfs_unmountall(void)
3251 {
3252 struct mount *mp;
3253 struct thread *td;
3254 int error;
3255
3256 KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread"));
3257 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3258 td = curthread;
3259
3260 /*
3261 * Since this only runs when rebooting, it is not interlocked.
3262 */
3263 while(!TAILQ_EMPTY(&mountlist)) {
3264 mp = TAILQ_LAST(&mountlist, mntlist);
3265 error = dounmount(mp, MNT_FORCE, td);
3266 if (error) {
3267 TAILQ_REMOVE(&mountlist, mp, mnt_list);
3268 /*
3269 * XXX: Due to the way in which we mount the root
3270 * file system off of devfs, devfs will generate a
3271 * "busy" warning when we try to unmount it before
3272 * the root. Don't print a warning as a result in
3273 * order to avoid false positive errors that may
3274 * cause needless upset.
3275 */
3276 if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
3277 printf("unmount of %s failed (",
3278 mp->mnt_stat.f_mntonname);
3279 if (error == EBUSY)
3280 printf("BUSY)\n");
3281 else
3282 printf("%d)\n", error);
3283 }
3284 } else {
3285 /* The unmount has removed mp from the mountlist */
3286 }
3287 }
3288 }
3289
3290 /*
3291 * perform msync on all vnodes under a mount point
3292 * the mount point must be locked.
3293 */
3294 void
3295 vfs_msync(struct mount *mp, int flags)
3296 {
3297 struct vnode *vp, *mvp;
3298 struct vm_object *obj;
3299
3300 CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3301 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3302 obj = vp->v_object;
3303 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3304 (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3305 if (!vget(vp,
3306 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3307 curthread)) {
3308 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */
3309 vput(vp);
3310 continue;
3311 }
3312
3313 obj = vp->v_object;
3314 if (obj != NULL) {
3315 VM_OBJECT_LOCK(obj);
3316 vm_object_page_clean(obj, 0, 0,
3317 flags == MNT_WAIT ?
3318 OBJPC_SYNC : OBJPC_NOSYNC);
3319 VM_OBJECT_UNLOCK(obj);
3320 }
3321 vput(vp);
3322 }
3323 } else
3324 VI_UNLOCK(vp);
3325 }
3326 }
3327
3328 static void
3329 destroy_vpollinfo(struct vpollinfo *vi)
3330 {
3331 seldrain(&vi->vpi_selinfo);
3332 knlist_destroy(&vi->vpi_selinfo.si_note);
3333 mtx_destroy(&vi->vpi_lock);
3334 uma_zfree(vnodepoll_zone, vi);
3335 }
3336
3337 /*
3338 * Initalize per-vnode helper structure to hold poll-related state.
3339 */
3340 void
3341 v_addpollinfo(struct vnode *vp)
3342 {
3343 struct vpollinfo *vi;
3344
3345 if (vp->v_pollinfo != NULL)
3346 return;
3347 vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
3348 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3349 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3350 vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3351 VI_LOCK(vp);
3352 if (vp->v_pollinfo != NULL) {
3353 VI_UNLOCK(vp);
3354 destroy_vpollinfo(vi);
3355 return;
3356 }
3357 vp->v_pollinfo = vi;
3358 VI_UNLOCK(vp);
3359 }
3360
3361 /*
3362 * Record a process's interest in events which might happen to
3363 * a vnode. Because poll uses the historic select-style interface
3364 * internally, this routine serves as both the ``check for any
3365 * pending events'' and the ``record my interest in future events''
3366 * functions. (These are done together, while the lock is held,
3367 * to avoid race conditions.)
3368 */
3369 int
3370 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3371 {
3372
3373 v_addpollinfo(vp);
3374 mtx_lock(&vp->v_pollinfo->vpi_lock);
3375 if (vp->v_pollinfo->vpi_revents & events) {
3376 /*
3377 * This leaves events we are not interested
3378 * in available for the other process which
3379 * which presumably had requested them
3380 * (otherwise they would never have been
3381 * recorded).
3382 */
3383 events &= vp->v_pollinfo->vpi_revents;
3384 vp->v_pollinfo->vpi_revents &= ~events;
3385
3386 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3387 return (events);
3388 }
3389 vp->v_pollinfo->vpi_events |= events;
3390 selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3391 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3392 return (0);
3393 }
3394
3395 /*
3396 * Routine to create and manage a filesystem syncer vnode.
3397 */
3398 #define sync_close ((int (*)(struct vop_close_args *))nullop)
3399 static int sync_fsync(struct vop_fsync_args *);
3400 static int sync_inactive(struct vop_inactive_args *);
3401 static int sync_reclaim(struct vop_reclaim_args *);
3402
3403 static struct vop_vector sync_vnodeops = {
3404 .vop_bypass = VOP_EOPNOTSUPP,
3405 .vop_close = sync_close, /* close */
3406 .vop_fsync = sync_fsync, /* fsync */
3407 .vop_inactive = sync_inactive, /* inactive */
3408 .vop_reclaim = sync_reclaim, /* reclaim */
3409 .vop_lock1 = vop_stdlock, /* lock */
3410 .vop_unlock = vop_stdunlock, /* unlock */
3411 .vop_islocked = vop_stdislocked, /* islocked */
3412 };
3413
3414 /*
3415 * Create a new filesystem syncer vnode for the specified mount point.
3416 */
3417 void
3418 vfs_allocate_syncvnode(struct mount *mp)
3419 {
3420 struct vnode *vp;
3421 struct bufobj *bo;
3422 static long start, incr, next;
3423 int error;
3424
3425 /* Allocate a new vnode */
3426 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3427 if (error != 0)
3428 panic("vfs_allocate_syncvnode: getnewvnode() failed");
3429 vp->v_type = VNON;
3430 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3431 vp->v_vflag |= VV_FORCEINSMQ;
3432 error = insmntque(vp, mp);
3433 if (error != 0)
3434 panic("vfs_allocate_syncvnode: insmntque() failed");
3435 vp->v_vflag &= ~VV_FORCEINSMQ;
3436 VOP_UNLOCK(vp, 0);
3437 /*
3438 * Place the vnode onto the syncer worklist. We attempt to
3439 * scatter them about on the list so that they will go off
3440 * at evenly distributed times even if all the filesystems
3441 * are mounted at once.
3442 */
3443 next += incr;
3444 if (next == 0 || next > syncer_maxdelay) {
3445 start /= 2;
3446 incr /= 2;
3447 if (start == 0) {
3448 start = syncer_maxdelay / 2;
3449 incr = syncer_maxdelay;
3450 }
3451 next = start;
3452 }
3453 bo = &vp->v_bufobj;
3454 BO_LOCK(bo);
3455 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3456 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3457 mtx_lock(&sync_mtx);
3458 sync_vnode_count++;
3459 if (mp->mnt_syncer == NULL) {
3460 mp->mnt_syncer = vp;
3461 vp = NULL;
3462 }
3463 mtx_unlock(&sync_mtx);
3464 BO_UNLOCK(bo);
3465 if (vp != NULL) {
3466 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3467 vgone(vp);
3468 vput(vp);
3469 }
3470 }
3471
3472 void
3473 vfs_deallocate_syncvnode(struct mount *mp)
3474 {
3475 struct vnode *vp;
3476
3477 mtx_lock(&sync_mtx);
3478 vp = mp->mnt_syncer;
3479 if (vp != NULL)
3480 mp->mnt_syncer = NULL;
3481 mtx_unlock(&sync_mtx);
3482 if (vp != NULL)
3483 vrele(vp);
3484 }
3485
3486 /*
3487 * Do a lazy sync of the filesystem.
3488 */
3489 static int
3490 sync_fsync(struct vop_fsync_args *ap)
3491 {
3492 struct vnode *syncvp = ap->a_vp;
3493 struct mount *mp = syncvp->v_mount;
3494 int error, save;
3495 struct bufobj *bo;
3496
3497 /*
3498 * We only need to do something if this is a lazy evaluation.
3499 */
3500 if (ap->a_waitfor != MNT_LAZY)
3501 return (0);
3502
3503 /*
3504 * Move ourselves to the back of the sync list.
3505 */
3506 bo = &syncvp->v_bufobj;
3507 BO_LOCK(bo);
3508 vn_syncer_add_to_worklist(bo, syncdelay);
3509 BO_UNLOCK(bo);
3510
3511 /*
3512 * Walk the list of vnodes pushing all that are dirty and
3513 * not already on the sync list.
3514 */
3515 mtx_lock(&mountlist_mtx);
3516 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
3517 mtx_unlock(&mountlist_mtx);
3518 return (0);
3519 }
3520 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3521 vfs_unbusy(mp);
3522 return (0);
3523 }
3524 save = curthread_pflags_set(TDP_SYNCIO);
3525 vfs_msync(mp, MNT_NOWAIT);
3526 error = VFS_SYNC(mp, MNT_LAZY);
3527 curthread_pflags_restore(save);
3528 vn_finished_write(mp);
3529 vfs_unbusy(mp);
3530 return (error);
3531 }
3532
3533 /*
3534 * The syncer vnode is no referenced.
3535 */
3536 static int
3537 sync_inactive(struct vop_inactive_args *ap)
3538 {
3539
3540 vgone(ap->a_vp);
3541 return (0);
3542 }
3543
3544 /*
3545 * The syncer vnode is no longer needed and is being decommissioned.
3546 *
3547 * Modifications to the worklist must be protected by sync_mtx.
3548 */
3549 static int
3550 sync_reclaim(struct vop_reclaim_args *ap)
3551 {
3552 struct vnode *vp = ap->a_vp;
3553 struct bufobj *bo;
3554
3555 bo = &vp->v_bufobj;
3556 BO_LOCK(bo);
3557 mtx_lock(&sync_mtx);
3558 if (vp->v_mount->mnt_syncer == vp)
3559 vp->v_mount->mnt_syncer = NULL;
3560 if (bo->bo_flag & BO_ONWORKLST) {
3561 LIST_REMOVE(bo, bo_synclist);
3562 syncer_worklist_len--;
3563 sync_vnode_count--;
3564 bo->bo_flag &= ~BO_ONWORKLST;
3565 }
3566 mtx_unlock(&sync_mtx);
3567 BO_UNLOCK(bo);
3568
3569 return (0);
3570 }
3571
3572 /*
3573 * Check if vnode represents a disk device
3574 */
3575 int
3576 vn_isdisk(struct vnode *vp, int *errp)
3577 {
3578 int error;
3579
3580 error = 0;
3581 dev_lock();
3582 if (vp->v_type != VCHR)
3583 error = ENOTBLK;
3584 else if (vp->v_rdev == NULL)
3585 error = ENXIO;
3586 else if (vp->v_rdev->si_devsw == NULL)
3587 error = ENXIO;
3588 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3589 error = ENOTBLK;
3590 dev_unlock();
3591 if (errp != NULL)
3592 *errp = error;
3593 return (error == 0);
3594 }
3595
3596 /*
3597 * Common filesystem object access control check routine. Accepts a
3598 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3599 * and optional call-by-reference privused argument allowing vaccess()
3600 * to indicate to the caller whether privilege was used to satisfy the
3601 * request (obsoleted). Returns 0 on success, or an errno on failure.
3602 */
3603 int
3604 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3605 accmode_t accmode, struct ucred *cred, int *privused)
3606 {
3607 accmode_t dac_granted;
3608 accmode_t priv_granted;
3609
3610 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
3611 ("invalid bit in accmode"));
3612 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
3613 ("VAPPEND without VWRITE"));
3614
3615 /*
3616 * Look for a normal, non-privileged way to access the file/directory
3617 * as requested. If it exists, go with that.
3618 */
3619
3620 if (privused != NULL)
3621 *privused = 0;
3622
3623 dac_granted = 0;
3624
3625 /* Check the owner. */
3626 if (cred->cr_uid == file_uid) {
3627 dac_granted |= VADMIN;
3628 if (file_mode & S_IXUSR)
3629 dac_granted |= VEXEC;
3630 if (file_mode & S_IRUSR)
3631 dac_granted |= VREAD;
3632 if (file_mode & S_IWUSR)
3633 dac_granted |= (VWRITE | VAPPEND);
3634
3635 if ((accmode & dac_granted) == accmode)
3636 return (0);
3637
3638 goto privcheck;
3639 }
3640
3641 /* Otherwise, check the groups (first match) */
3642 if (groupmember(file_gid, cred)) {
3643 if (file_mode & S_IXGRP)
3644 dac_granted |= VEXEC;
3645 if (file_mode & S_IRGRP)
3646 dac_granted |= VREAD;
3647 if (file_mode & S_IWGRP)
3648 dac_granted |= (VWRITE | VAPPEND);
3649
3650 if ((accmode & dac_granted) == accmode)
3651 return (0);
3652
3653 goto privcheck;
3654 }
3655
3656 /* Otherwise, check everyone else. */
3657 if (file_mode & S_IXOTH)
3658 dac_granted |= VEXEC;
3659 if (file_mode & S_IROTH)
3660 dac_granted |= VREAD;
3661 if (file_mode & S_IWOTH)
3662 dac_granted |= (VWRITE | VAPPEND);
3663 if ((accmode & dac_granted) == accmode)
3664 return (0);
3665
3666 privcheck:
3667 /*
3668 * Build a privilege mask to determine if the set of privileges
3669 * satisfies the requirements when combined with the granted mask
3670 * from above. For each privilege, if the privilege is required,
3671 * bitwise or the request type onto the priv_granted mask.
3672 */
3673 priv_granted = 0;
3674
3675 if (type == VDIR) {
3676 /*
3677 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3678 * requests, instead of PRIV_VFS_EXEC.
3679 */
3680 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3681 !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3682 priv_granted |= VEXEC;
3683 } else {
3684 /*
3685 * Ensure that at least one execute bit is on. Otherwise,
3686 * a privileged user will always succeed, and we don't want
3687 * this to happen unless the file really is executable.
3688 */
3689 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3690 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
3691 !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
3692 priv_granted |= VEXEC;
3693 }
3694
3695 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
3696 !priv_check_cred(cred, PRIV_VFS_READ, 0))
3697 priv_granted |= VREAD;
3698
3699 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3700 !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
3701 priv_granted |= (VWRITE | VAPPEND);
3702
3703 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3704 !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
3705 priv_granted |= VADMIN;
3706
3707 if ((accmode & (priv_granted | dac_granted)) == accmode) {
3708 /* XXX audit: privilege used */
3709 if (privused != NULL)
3710 *privused = 1;
3711 return (0);
3712 }
3713
3714 return ((accmode & VADMIN) ? EPERM : EACCES);
3715 }
3716
3717 /*
3718 * Credential check based on process requesting service, and per-attribute
3719 * permissions.
3720 */
3721 int
3722 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
3723 struct thread *td, accmode_t accmode)
3724 {
3725
3726 /*
3727 * Kernel-invoked always succeeds.
3728 */
3729 if (cred == NOCRED)
3730 return (0);
3731
3732 /*
3733 * Do not allow privileged processes in jail to directly manipulate
3734 * system attributes.
3735 */
3736 switch (attrnamespace) {
3737 case EXTATTR_NAMESPACE_SYSTEM:
3738 /* Potentially should be: return (EPERM); */
3739 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
3740 case EXTATTR_NAMESPACE_USER:
3741 return (VOP_ACCESS(vp, accmode, cred, td));
3742 default:
3743 return (EPERM);
3744 }
3745 }
3746
3747 #ifdef DEBUG_VFS_LOCKS
3748 /*
3749 * This only exists to suppress warnings from unlocked specfs accesses. It is
3750 * no longer ok to have an unlocked VFS.
3751 */
3752 #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \
3753 (vp)->v_type == VCHR || (vp)->v_type == VBAD)
3754
3755 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */
3756 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
3757 "Drop into debugger on lock violation");
3758
3759 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */
3760 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
3761 0, "Check for interlock across VOPs");
3762
3763 int vfs_badlock_print = 1; /* Print lock violations. */
3764 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
3765 0, "Print lock violations");
3766
3767 #ifdef KDB
3768 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */
3769 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
3770 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
3771 #endif
3772
3773 static void
3774 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3775 {
3776
3777 #ifdef KDB
3778 if (vfs_badlock_backtrace)
3779 kdb_backtrace();
3780 #endif
3781 if (vfs_badlock_print)
3782 printf("%s: %p %s\n", str, (void *)vp, msg);
3783 if (vfs_badlock_ddb)
3784 kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
3785 }
3786
3787 void
3788 assert_vi_locked(struct vnode *vp, const char *str)
3789 {
3790
3791 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3792 vfs_badlock("interlock is not locked but should be", str, vp);
3793 }
3794
3795 void
3796 assert_vi_unlocked(struct vnode *vp, const char *str)
3797 {
3798
3799 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3800 vfs_badlock("interlock is locked but should not be", str, vp);
3801 }
3802
/*
 * Complain if VOP_ISLOCKED() reports vp as not locked at all.
 * Vnodes matched by IGNORE_LOCK() are exempt.
 */
void
assert_vop_locked(struct vnode *vp, const char *str)
{

	if (IGNORE_LOCK(vp))
		return;
	if (VOP_ISLOCKED(vp) != 0)
		return;
	vfs_badlock("is not locked but should be", str, vp);
}
3810
3811 void
3812 assert_vop_unlocked(struct vnode *vp, const char *str)
3813 {
3814
3815 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
3816 vfs_badlock("is locked but should not be", str, vp);
3817 }
3818
3819 void
3820 assert_vop_elocked(struct vnode *vp, const char *str)
3821 {
3822
3823 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
3824 vfs_badlock("is not exclusive locked but should be", str, vp);
3825 }
3826
3827 #if 0
3828 void
3829 assert_vop_elocked_other(struct vnode *vp, const char *str)
3830 {
3831
3832 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
3833 vfs_badlock("is not exclusive locked by another thread",
3834 str, vp);
3835 }
3836
3837 void
3838 assert_vop_slocked(struct vnode *vp, const char *str)
3839 {
3840
3841 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
3842 vfs_badlock("is not locked shared but should be", str, vp);
3843 }
3844 #endif /* 0 */
3845 #endif /* DEBUG_VFS_LOCKS */
3846
3847 void
3848 vop_rename_fail(struct vop_rename_args *ap)
3849 {
3850
3851 if (ap->a_tvp != NULL)
3852 vput(ap->a_tvp);
3853 if (ap->a_tdvp == ap->a_tvp)
3854 vrele(ap->a_tdvp);
3855 else
3856 vput(ap->a_tdvp);
3857 vrele(ap->a_fdvp);
3858 vrele(ap->a_fvp);
3859 }
3860
3861 void
3862 vop_rename_pre(void *ap)
3863 {
3864 struct vop_rename_args *a = ap;
3865
3866 #ifdef DEBUG_VFS_LOCKS
3867 if (a->a_tvp)
3868 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
3869 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
3870 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
3871 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
3872
3873 /* Check the source (from). */
3874 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
3875 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
3876 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
3877 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
3878 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
3879
3880 /* Check the target. */
3881 if (a->a_tvp)
3882 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
3883 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
3884 #endif
3885 if (a->a_tdvp != a->a_fdvp)
3886 vhold(a->a_fdvp);
3887 if (a->a_tvp != a->a_fvp)
3888 vhold(a->a_fvp);
3889 vhold(a->a_tdvp);
3890 if (a->a_tvp)
3891 vhold(a->a_tvp);
3892 }
3893
/*
 * Pre-hook for VOP_STRATEGY.  With DEBUG_VFS_LOCKS it reports a buffer
 * that is not locked; otherwise it does nothing.
 */
void
vop_strategy_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_strategy_args *args = ap;
	struct buf *bp = args->a_bp;

	/*
	 * Cluster ops lock their component buffers but not the IO container.
	 */
	if ((bp->b_flags & B_CLUSTER) != 0)
		return;

	/* Nothing to report while panicking or when the buffer is locked. */
	if (panicstr != NULL || BUF_ISLOCKED(bp))
		return;

	if (vfs_badlock_print)
		printf(
		    "VOP_STRATEGY: bp is not locked but should be\n");
	if (vfs_badlock_ddb)
		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
#endif
}
3919
/*
 * Pre-hook for VOP_LOOKUP.  With DEBUG_VFS_LOCKS it asserts that the
 * directory vnode is locked and its interlock is not held.
 */
void
vop_lookup_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_lookup_args *args = ap;
	struct vnode *dvp = args->a_dvp;

	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
#endif
}
3933
/*
 * Post-hook for VOP_LOOKUP.  With DEBUG_VFS_LOCKS it asserts that the
 * directory vnode is still locked with its interlock not held and, when
 * the lookup succeeded (rc == 0), that the returned vnode is locked.
 */
void
vop_lookup_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_lookup_args *args = ap;
	struct vnode *dvp = args->a_dvp;
	struct vnode *child = *(args->a_vpp);

	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");

	if (rc == 0)
		ASSERT_VOP_LOCKED(child, "VOP_LOOKUP (child)");
#endif
}
3953
/*
 * Pre-hook for VOP_LOCK.  With DEBUG_VFS_LOCKS it asserts that the vnode
 * interlock is held exactly when LK_INTERLOCK is passed.
 */
void
vop_lock_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_lock1_args *args = ap;

	if ((args->a_flags & LK_INTERLOCK) != 0)
		ASSERT_VI_LOCKED(args->a_vp, "VOP_LOCK");
	else
		ASSERT_VI_UNLOCKED(args->a_vp, "VOP_LOCK");
#endif
}
3966
/*
 * Post-hook for VOP_LOCK.  With DEBUG_VFS_LOCKS it asserts that the
 * interlock is not held and, on success (rc == 0), that the vnode is
 * locked.
 */
void
vop_lock_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_lock1_args *args = ap;

	ASSERT_VI_UNLOCKED(args->a_vp, "VOP_LOCK");
	if (rc == 0)
		ASSERT_VOP_LOCKED(args->a_vp, "VOP_LOCK");
#endif
}
3978
/*
 * Pre-hook for VOP_UNLOCK.  With DEBUG_VFS_LOCKS it asserts that the
 * vnode is locked and, if LK_INTERLOCK is passed, that the interlock is
 * held.
 */
void
vop_unlock_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_unlock_args *args = ap;

	if ((args->a_flags & LK_INTERLOCK) != 0)
		ASSERT_VI_LOCKED(args->a_vp, "VOP_UNLOCK");
	ASSERT_VOP_LOCKED(args->a_vp, "VOP_UNLOCK");
#endif
}
3990
/*
 * Post-hook for VOP_UNLOCK.  With DEBUG_VFS_LOCKS it asserts that an
 * interlock passed in with LK_INTERLOCK is no longer held.  The result
 * code is not examined.
 */
void
vop_unlock_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_unlock_args *args = ap;

	if ((args->a_flags & LK_INTERLOCK) != 0)
		ASSERT_VI_UNLOCKED(args->a_vp, "VOP_UNLOCK");
#endif
}
4001
4002 void
4003 vop_create_post(void *ap, int rc)
4004 {
4005 struct vop_create_args *a = ap;
4006
4007 if (!rc)
4008 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4009 }
4010
4011 void
4012 vop_deleteextattr_post(void *ap, int rc)
4013 {
4014 struct vop_deleteextattr_args *a = ap;
4015
4016 if (!rc)
4017 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4018 }
4019
4020 void
4021 vop_link_post(void *ap, int rc)
4022 {
4023 struct vop_link_args *a = ap;
4024
4025 if (!rc) {
4026 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4027 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4028 }
4029 }
4030
4031 void
4032 vop_mkdir_post(void *ap, int rc)
4033 {
4034 struct vop_mkdir_args *a = ap;
4035
4036 if (!rc)
4037 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4038 }
4039
4040 void
4041 vop_mknod_post(void *ap, int rc)
4042 {
4043 struct vop_mknod_args *a = ap;
4044
4045 if (!rc)
4046 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4047 }
4048
4049 void
4050 vop_remove_post(void *ap, int rc)
4051 {
4052 struct vop_remove_args *a = ap;
4053
4054 if (!rc) {
4055 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4056 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4057 }
4058 }
4059
4060 void
4061 vop_rename_post(void *ap, int rc)
4062 {
4063 struct vop_rename_args *a = ap;
4064
4065 if (!rc) {
4066 VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4067 VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4068 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4069 if (a->a_tvp)
4070 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4071 }
4072 if (a->a_tdvp != a->a_fdvp)
4073 vdrop(a->a_fdvp);
4074 if (a->a_tvp != a->a_fvp)
4075 vdrop(a->a_fvp);
4076 vdrop(a->a_tdvp);
4077 if (a->a_tvp)
4078 vdrop(a->a_tvp);
4079 }
4080
4081 void
4082 vop_rmdir_post(void *ap, int rc)
4083 {
4084 struct vop_rmdir_args *a = ap;
4085
4086 if (!rc) {
4087 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4088 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4089 }
4090 }
4091
4092 void
4093 vop_setattr_post(void *ap, int rc)
4094 {
4095 struct vop_setattr_args *a = ap;
4096
4097 if (!rc)
4098 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4099 }
4100
4101 void
4102 vop_setextattr_post(void *ap, int rc)
4103 {
4104 struct vop_setextattr_args *a = ap;
4105
4106 if (!rc)
4107 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4108 }
4109
4110 void
4111 vop_symlink_post(void *ap, int rc)
4112 {
4113 struct vop_symlink_args *a = ap;
4114
4115 if (!rc)
4116 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4117 }
4118
4119 static struct knlist fs_knlist;
4120
4121 static void
4122 vfs_event_init(void *arg)
4123 {
4124 knlist_init_mtx(&fs_knlist, NULL);
4125 }
4126 /* XXX - correct order? */
4127 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4128
4129 void
4130 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4131 {
4132
4133 KNOTE_UNLOCKED(&fs_knlist, event);
4134 }
4135
4136 static int filt_fsattach(struct knote *kn);
4137 static void filt_fsdetach(struct knote *kn);
4138 static int filt_fsevent(struct knote *kn, long hint);
4139
4140 struct filterops fs_filtops = {
4141 .f_isfd = 0,
4142 .f_attach = filt_fsattach,
4143 .f_detach = filt_fsdetach,
4144 .f_event = filt_fsevent
4145 };
4146
4147 static int
4148 filt_fsattach(struct knote *kn)
4149 {
4150
4151 kn->kn_flags |= EV_CLEAR;
4152 knlist_add(&fs_knlist, kn, 0);
4153 return (0);
4154 }
4155
4156 static void
4157 filt_fsdetach(struct knote *kn)
4158 {
4159
4160 knlist_remove(&fs_knlist, kn, 0);
4161 }
4162
4163 static int
4164 filt_fsevent(struct knote *kn, long hint)
4165 {
4166
4167 kn->kn_fflags |= hint;
4168 return (kn->kn_fflags != 0);
4169 }
4170
4171 static int
4172 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4173 {
4174 struct vfsidctl vc;
4175 int error;
4176 struct mount *mp;
4177
4178 error = SYSCTL_IN(req, &vc, sizeof(vc));
4179 if (error)
4180 return (error);
4181 if (vc.vc_vers != VFS_CTL_VERS1)
4182 return (EINVAL);
4183 mp = vfs_getvfs(&vc.vc_fsid);
4184 if (mp == NULL)
4185 return (ENOENT);
4186 /* ensure that a specific sysctl goes to the right filesystem. */
4187 if (strcmp(vc.vc_fstypename, "*") != 0 &&
4188 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4189 vfs_rel(mp);
4190 return (EINVAL);
4191 }
4192 VCTLTOREQ(&vc, req);
4193 error = VFS_SYSCTL(mp, vc.vc_op, req);
4194 vfs_rel(mp);
4195 return (error);
4196 }
4197
4198 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4199 NULL, 0, sysctl_vfs_ctl, "",
4200 "Sysctl by fsid");
4201
4202 /*
4203 * Function to initialize a va_filerev field sensibly.
4204 * XXX: Wouldn't a random number make a lot more sense ??
4205 */
4206 u_quad_t
4207 init_va_filerev(void)
4208 {
4209 struct bintime bt;
4210
4211 getbinuptime(&bt);
4212 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4213 }
4214
4215 static int filt_vfsread(struct knote *kn, long hint);
4216 static int filt_vfswrite(struct knote *kn, long hint);
4217 static int filt_vfsvnode(struct knote *kn, long hint);
4218 static void filt_vfsdetach(struct knote *kn);
4219 static struct filterops vfsread_filtops = {
4220 .f_isfd = 1,
4221 .f_detach = filt_vfsdetach,
4222 .f_event = filt_vfsread
4223 };
4224 static struct filterops vfswrite_filtops = {
4225 .f_isfd = 1,
4226 .f_detach = filt_vfsdetach,
4227 .f_event = filt_vfswrite
4228 };
4229 static struct filterops vfsvnode_filtops = {
4230 .f_isfd = 1,
4231 .f_detach = filt_vfsdetach,
4232 .f_event = filt_vfsvnode
4233 };
4234
4235 static void
4236 vfs_knllock(void *arg)
4237 {
4238 struct vnode *vp = arg;
4239
4240 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4241 }
4242
/*
 * knlist unlock callback: release the vnode lock.  "arg" is the vnode.
 */
static void
vfs_knlunlock(void *arg)
{
	struct vnode *vp;

	vp = arg;
	VOP_UNLOCK(vp, 0);
}
4250
/*
 * knlist callback: assert that the vnode passed as "arg" is locked.
 * Does nothing unless DEBUG_VFS_LOCKS is defined.
 */
static void
vfs_knl_assert_locked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *vp;

	vp = arg;
	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
#endif
}
4260
/*
 * knlist callback: assert that the vnode passed as "arg" is unlocked.
 * Does nothing unless DEBUG_VFS_LOCKS is defined.
 */
static void
vfs_knl_assert_unlocked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *vp;

	vp = arg;
	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
#endif
}
4270
4271 int
4272 vfs_kqfilter(struct vop_kqfilter_args *ap)
4273 {
4274 struct vnode *vp = ap->a_vp;
4275 struct knote *kn = ap->a_kn;
4276 struct knlist *knl;
4277
4278 switch (kn->kn_filter) {
4279 case EVFILT_READ:
4280 kn->kn_fop = &vfsread_filtops;
4281 break;
4282 case EVFILT_WRITE:
4283 kn->kn_fop = &vfswrite_filtops;
4284 break;
4285 case EVFILT_VNODE:
4286 kn->kn_fop = &vfsvnode_filtops;
4287 break;
4288 default:
4289 return (EINVAL);
4290 }
4291
4292 kn->kn_hook = (caddr_t)vp;
4293
4294 v_addpollinfo(vp);
4295 if (vp->v_pollinfo == NULL)
4296 return (ENOMEM);
4297 knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4298 knlist_add(knl, kn, 0);
4299
4300 return (0);
4301 }
4302
4303 /*
4304 * Detach knote from vnode
4305 */
4306 static void
4307 filt_vfsdetach(struct knote *kn)
4308 {
4309 struct vnode *vp = (struct vnode *)kn->kn_hook;
4310
4311 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4312 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4313 }
4314
4315 /*ARGSUSED*/
4316 static int
4317 filt_vfsread(struct knote *kn, long hint)
4318 {
4319 struct vnode *vp = (struct vnode *)kn->kn_hook;
4320 struct vattr va;
4321 int res;
4322
4323 /*
4324 * filesystem is gone, so set the EOF flag and schedule
4325 * the knote for deletion.
4326 */
4327 if (hint == NOTE_REVOKE) {
4328 VI_LOCK(vp);
4329 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4330 VI_UNLOCK(vp);
4331 return (1);
4332 }
4333
4334 if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4335 return (0);
4336
4337 VI_LOCK(vp);
4338 kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4339 res = (kn->kn_data != 0);
4340 VI_UNLOCK(vp);
4341 return (res);
4342 }
4343
4344 /*ARGSUSED*/
4345 static int
4346 filt_vfswrite(struct knote *kn, long hint)
4347 {
4348 struct vnode *vp = (struct vnode *)kn->kn_hook;
4349
4350 VI_LOCK(vp);
4351
4352 /*
4353 * filesystem is gone, so set the EOF flag and schedule
4354 * the knote for deletion.
4355 */
4356 if (hint == NOTE_REVOKE)
4357 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4358
4359 kn->kn_data = 0;
4360 VI_UNLOCK(vp);
4361 return (1);
4362 }
4363
4364 static int
4365 filt_vfsvnode(struct knote *kn, long hint)
4366 {
4367 struct vnode *vp = (struct vnode *)kn->kn_hook;
4368 int res;
4369
4370 VI_LOCK(vp);
4371 if (kn->kn_sfflags & hint)
4372 kn->kn_fflags |= hint;
4373 if (hint == NOTE_REVOKE) {
4374 kn->kn_flags |= EV_EOF;
4375 VI_UNLOCK(vp);
4376 return (1);
4377 }
4378 res = (kn->kn_fflags != 0);
4379 VI_UNLOCK(vp);
4380 return (res);
4381 }
4382
4383 int
4384 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4385 {
4386 int error;
4387
4388 if (dp->d_reclen > ap->a_uio->uio_resid)
4389 return (ENAMETOOLONG);
4390 error = uiomove(dp, dp->d_reclen, ap->a_uio);
4391 if (error) {
4392 if (ap->a_ncookies != NULL) {
4393 if (ap->a_cookies != NULL)
4394 free(ap->a_cookies, M_TEMP);
4395 ap->a_cookies = NULL;
4396 *ap->a_ncookies = 0;
4397 }
4398 return (error);
4399 }
4400 if (ap->a_ncookies == NULL)
4401 return (0);
4402
4403 KASSERT(ap->a_cookies,
4404 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4405
4406 *ap->a_cookies = realloc(*ap->a_cookies,
4407 (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4408 (*ap->a_cookies)[*ap->a_ncookies] = off;
4409 return (0);
4410 }
4411
4412 /*
4413 * Mark for update the access time of the file if the filesystem
4414 * supports VOP_MARKATIME. This functionality is used by execve and
4415 * mmap, so we want to avoid the I/O implied by directly setting
4416 * va_atime for the sake of efficiency.
4417 */
4418 void
4419 vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4420 {
4421 struct mount *mp;
4422
4423 mp = vp->v_mount;
4424 VFS_ASSERT_GIANT(mp);
4425 ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4426 if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4427 (void)VOP_MARKATIME(vp);
4428 }
4429
4430 /*
4431 * The purpose of this routine is to remove granularity from accmode_t,
4432 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
4433 * VADMIN and VAPPEND.
4434 *
4435 * If it returns 0, the caller is supposed to continue with the usual
4436 * access checks using 'accmode' as modified by this routine. If it
4437 * returns nonzero value, the caller is supposed to return that value
4438 * as errno.
4439 *
4440 * Note that after this routine runs, accmode may be zero.
4441 */
4442 int
4443 vfs_unixify_accmode(accmode_t *accmode)
4444 {
4445 /*
4446 * There is no way to specify explicit "deny" rule using
4447 * file mode or POSIX.1e ACLs.
4448 */
4449 if (*accmode & VEXPLICIT_DENY) {
4450 *accmode = 0;
4451 return (0);
4452 }
4453
4454 /*
4455 * None of these can be translated into usual access bits.
4456 * Also, the common case for NFSv4 ACLs is to not contain
4457 * either of these bits. Caller should check for VWRITE
4458 * on the containing directory instead.
4459 */
4460 if (*accmode & (VDELETE_CHILD | VDELETE))
4461 return (EPERM);
4462
4463 if (*accmode & VADMIN_PERMS) {
4464 *accmode &= ~VADMIN_PERMS;
4465 *accmode |= VADMIN;
4466 }
4467
4468 /*
4469 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4470 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4471 */
4472 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4473
4474 return (0);
4475 }
4476
4477 /*
4478 * These are helper functions for filesystems to traverse all
4479 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
4480 *
4481 * This interface replaces MNT_VNODE_FOREACH.
4482 */
4483
4484 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
4485
4486 struct vnode *
4487 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
4488 {
4489 struct vnode *vp;
4490
4491 if (should_yield())
4492 kern_yield(PRI_UNCHANGED);
4493 MNT_ILOCK(mp);
4494 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4495 vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
4496 while (vp != NULL && (vp->v_type == VMARKER ||
4497 (vp->v_iflag & VI_DOOMED) != 0))
4498 vp = TAILQ_NEXT(vp, v_nmntvnodes);
4499
4500 /* Check if we are done */
4501 if (vp == NULL) {
4502 __mnt_vnode_markerfree_all(mvp, mp);
4503 /* MNT_IUNLOCK(mp); -- done in above function */
4504 mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
4505 return (NULL);
4506 }
4507 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4508 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4509 VI_LOCK(vp);
4510 MNT_IUNLOCK(mp);
4511 return (vp);
4512 }
4513
4514 struct vnode *
4515 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
4516 {
4517 struct vnode *vp;
4518
4519 *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4520 MNT_ILOCK(mp);
4521 MNT_REF(mp);
4522 (*mvp)->v_type = VMARKER;
4523
4524 vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
4525 while (vp != NULL && (vp->v_type == VMARKER ||
4526 (vp->v_iflag & VI_DOOMED) != 0))
4527 vp = TAILQ_NEXT(vp, v_nmntvnodes);
4528
4529 /* Check if we are done */
4530 if (vp == NULL) {
4531 *mvp = NULL;
4532 MNT_REL(mp);
4533 MNT_IUNLOCK(mp);
4534 free(*mvp, M_VNODE_MARKER);
4535 return (NULL);
4536 }
4537 (*mvp)->v_mount = mp;
4538 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4539 VI_LOCK(vp);
4540 MNT_IUNLOCK(mp);
4541 return (vp);
4542 }
4543
4544
4545 void
4546 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
4547 {
4548
4549 if (*mvp == NULL) {
4550 MNT_IUNLOCK(mp);
4551 return;
4552 }
4553
4554 mtx_assert(MNT_MTX(mp), MA_OWNED);
4555
4556 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4557 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4558 MNT_REL(mp);
4559 MNT_IUNLOCK(mp);
4560 free(*mvp, M_VNODE_MARKER);
4561 *mvp = NULL;
4562 }

Properties

Name Value
svn:keywords FreeBSD=%H

  ViewVC Help
Powered by ViewVC 1.1.27