5384 pvn_getpages may assert in valid scenarios
--- old/usr/src/uts/common/vm/vm_pvn.c
+++ new/usr/src/uts/common/vm/vm_pvn.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
23 24 */
24 25
25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
26 27 /* All Rights Reserved */
27 28
28 29 /*
29 30 * University Copyright- Copyright (c) 1982, 1986, 1988
30 31 * The Regents of the University of California
31 32 * All Rights Reserved
32 33 *
33 34 * University Acknowledgment- Portions of this document are derived from
34 35 * software developed by the University of California, Berkeley, and its
35 36 * contributors.
36 37 */
37 38
38 39 /*
39 40 * VM - paged vnode.
40 41 *
41 42 * This file supplies vm support for the vnode operations that deal with pages.
42 43 */
43 44 #include <sys/types.h>
44 45 #include <sys/t_lock.h>
45 46 #include <sys/param.h>
46 47 #include <sys/sysmacros.h>
47 48 #include <sys/systm.h>
48 49 #include <sys/time.h>
49 50 #include <sys/buf.h>
50 51 #include <sys/vnode.h>
51 52 #include <sys/uio.h>
52 53 #include <sys/vmsystm.h>
53 54 #include <sys/mman.h>
54 55 #include <sys/vfs.h>
55 56 #include <sys/cred.h>
56 57 #include <sys/user.h>
57 58 #include <sys/kmem.h>
58 59 #include <sys/cmn_err.h>
59 60 #include <sys/debug.h>
60 61 #include <sys/cpuvar.h>
61 62 #include <sys/vtrace.h>
62 63 #include <sys/tnf_probe.h>
63 64
64 65 #include <vm/hat.h>
65 66 #include <vm/as.h>
66 67 #include <vm/seg.h>
67 68 #include <vm/rm.h>
68 69 #include <vm/pvn.h>
69 70 #include <vm/page.h>
70 71 #include <vm/seg_map.h>
71 72 #include <vm/seg_kmem.h>
72 73 #include <sys/fs/swapnode.h>
73 74
74 75 int pvn_nofodklust = 0;
75 76 int pvn_write_noklust = 0;
76 77
77 78 uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */
78 79 uint_t pvn_vmodsort_disable = 0; /* set in /etc/system to disable HAT */
79 80 /* support for vmodsort for testing */
80 81
81 82 static struct kmem_cache *marker_cache = NULL;
82 83
83 84 /*
84 85 * Find the largest contiguous block which contains `addr' for file offset
85 86 * `offset' in it while living within the file system block sizes (`vp_off'
86 87 * and `vp_len') and the address space limits for which no pages currently
87 88 * exist and which map to consecutive file offsets.
88 89 */
89 90 page_t *
90 91 pvn_read_kluster(
91 92 struct vnode *vp,
92 93 u_offset_t off,
93 94 struct seg *seg,
94 95 caddr_t addr,
95 96 u_offset_t *offp, /* return values */
96 97 size_t *lenp, /* return values */
97 98 u_offset_t vp_off,
98 99 size_t vp_len,
99 100 int isra)
100 101 {
101 102 ssize_t deltaf, deltab;
102 103 page_t *pp;
103 104 page_t *plist = NULL;
104 105 spgcnt_t pagesavail;
105 106 u_offset_t vp_end;
106 107
107 108 ASSERT(off >= vp_off && off < vp_off + vp_len);
108 109
109 110 /*
110 111 * We only want to do klustering/read ahead if there
111 112 * are more than minfree pages currently available.
112 113 */
113 114 pagesavail = freemem - minfree;
114 115
115 116 if (pagesavail <= 0)
116 117 if (isra)
117 118 return ((page_t *)NULL); /* ra case - give up */
118 119 else
119 120 pagesavail = 1; /* must return a page */
120 121
121 122 /* We calculate in pages instead of bytes due to 32-bit overflows */
122 123 if (pagesavail < (spgcnt_t)btopr(vp_len)) {
123 124 /*
124 125 * Don't have enough free memory for the
125 126 * max request, try sizing down vp request.
126 127 */
127 128 deltab = (ssize_t)(off - vp_off);
128 129 vp_len -= deltab;
129 130 vp_off += deltab;
130 131 if (pagesavail < btopr(vp_len)) {
131 132 /*
132 133 * Still not enough memory, just settle for
133 134 * pagesavail which is at least 1.
134 135 */
135 136 vp_len = ptob(pagesavail);
136 137 }
137 138 }
138 139
139 140 vp_end = vp_off + vp_len;
140 141 ASSERT(off >= vp_off && off < vp_end);
141 142
142 143 if (isra && SEGOP_KLUSTER(seg, addr, 0))
143 144 return ((page_t *)NULL); /* segment driver says no */
144 145
145 146 if ((plist = page_create_va(vp, off,
146 147 PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
147 148 return ((page_t *)NULL);
148 149
149 150 if (vp_len <= PAGESIZE || pvn_nofodklust) {
150 151 *offp = off;
151 152 *lenp = MIN(vp_len, PAGESIZE);
152 153 } else {
153 154 /*
154 155 * Scan back from front by incrementing "deltab" and
155 156 * comparing "off" with "vp_off + deltab" to avoid
156 157 * "signed" versus "unsigned" conversion problems.
157 158 */
158 159 for (deltab = PAGESIZE; off >= vp_off + deltab;
159 160 deltab += PAGESIZE) {
160 161 /*
161 162 * Call back to the segment driver to verify that
162 163 * the klustering/read ahead operation makes sense.
163 164 */
164 165 if (SEGOP_KLUSTER(seg, addr, -deltab))
165 166 break; /* page not eligible */
166 167 if ((pp = page_create_va(vp, off - deltab,
167 168 PAGESIZE, PG_EXCL, seg, addr - deltab))
168 169 == NULL)
169 170 break; /* already have the page */
170 171 /*
171 172 * Add page to front of page list.
172 173 */
173 174 page_add(&plist, pp);
174 175 }
175 176 deltab -= PAGESIZE;
176 177
177 178 /* scan forward from front */
178 179 for (deltaf = PAGESIZE; off + deltaf < vp_end;
179 180 deltaf += PAGESIZE) {
180 181 /*
181 182 * Call back to the segment driver to verify that
182 183 * the klustering/read ahead operation makes sense.
183 184 */
184 185 if (SEGOP_KLUSTER(seg, addr, deltaf))
185 186 break; /* page not file extension */
186 187 if ((pp = page_create_va(vp, off + deltaf,
187 188 PAGESIZE, PG_EXCL, seg, addr + deltaf))
188 189 == NULL)
189 190 break; /* already have page */
190 191
191 192 /*
192 193 * Add page to end of page list.
193 194 */
194 195 page_add(&plist, pp);
195 196 plist = plist->p_next;
196 197 }
197 198 *offp = off = off - deltab;
198 199 *lenp = deltab + deltaf;
199 200 ASSERT(off >= vp_off);
200 201
201 202 /*
202 203 * If we ended up getting more than was actually
203 204 * requested, retract the returned length to only
204 205 * reflect what was requested. This might happen
205 206 * if we were allowed to kluster pages across a
206 207 * span of (say) 5 frags, and frag size is less
207 208 * than PAGESIZE. We need a whole number of
208 209 * pages to contain those frags, but the returned
209 210 * size should only allow the returned range to
210 211 * extend as far as the end of the frags.
211 212 */
212 213 if ((vp_off + vp_len) < (off + *lenp)) {
213 214 ASSERT(vp_end > off);
214 215 *lenp = vp_end - off;
215 216 }
216 217 }
217 218 TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
218 219 "pvn_read_kluster:seg %p addr %x isra %x",
219 220 seg, addr, isra);
220 221 return (plist);
221 222 }
222 223
223 224 /*
224 225 * Handle pages for this vnode on either side of the page "pp"
225 226 * which has been locked by the caller. This routine will also
226 227 * do klustering in the range [vp_off, vp_off + vp_len] up
227 228 * until a page which is not found. The offset and length
228 229 * of pages included is returned in "*offp" and "*lenp".
229 230 *
230 231 * Returns a list of dirty locked pages all ready to be
231 232 * written back.
232 233 */
233 234 page_t *
234 235 pvn_write_kluster(
235 236 struct vnode *vp,
236 237 page_t *pp,
237 238 u_offset_t *offp, /* return values */
238 239 size_t *lenp, /* return values */
239 240 u_offset_t vp_off,
240 241 size_t vp_len,
241 242 int flags)
242 243 {
243 244 u_offset_t off;
244 245 page_t *dirty;
245 246 size_t deltab, deltaf;
246 247 se_t se;
247 248 u_offset_t vp_end;
248 249
249 250 off = pp->p_offset;
250 251
251 252 /*
252 253 * Klustering should not be done if we are invalidating
253 254 * pages since we could destroy pages that belong to
254 255 * some other process if this is a swap vnode.
255 256 */
256 257 if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
257 258 *offp = off;
258 259 *lenp = PAGESIZE;
259 260 return (pp);
260 261 }
261 262
262 263 if (flags & (B_FREE | B_INVAL))
263 264 se = SE_EXCL;
264 265 else
265 266 se = SE_SHARED;
266 267
267 268 dirty = pp;
268 269 /*
269 270 * Scan backwards looking for pages to kluster by incrementing
270 271 * "deltab" and comparing "off" with "vp_off + deltab" to
271 272 * avoid "signed" versus "unsigned" conversion problems.
272 273 */
273 274 for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
274 275 pp = page_lookup_nowait(vp, off - deltab, se);
275 276 if (pp == NULL)
276 277 break; /* page not found */
277 278 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
278 279 break;
279 280 page_add(&dirty, pp);
280 281 }
281 282 deltab -= PAGESIZE;
282 283
283 284 vp_end = vp_off + vp_len;
284 285 /* now scan forwards looking for pages to kluster */
285 286 for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
286 287 pp = page_lookup_nowait(vp, off + deltaf, se);
287 288 if (pp == NULL)
288 289 break; /* page not found */
289 290 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
290 291 break;
291 292 page_add(&dirty, pp);
292 293 dirty = dirty->p_next;
293 294 }
294 295
295 296 *offp = off - deltab;
296 297 *lenp = deltab + deltaf;
297 298 return (dirty);
298 299 }
299 300
300 301 /*
301 302 * Generic entry point used to release the "shared/exclusive" lock
302 303 * and the "p_iolock" on pages after i/o is complete.
303 304 */
304 305 void
305 306 pvn_io_done(page_t *plist)
306 307 {
307 308 page_t *pp;
308 309
309 310 while (plist != NULL) {
310 311 pp = plist;
311 312 page_sub(&plist, pp);
312 313 page_io_unlock(pp);
313 314 page_unlock(pp);
314 315 }
315 316 }
316 317
317 318 /*
318 319 * Entry point to be used by file system getpage subr's and
319 320 * other such routines which either want to unlock pages (B_ASYNC
320 321 * request) or destroy a list of pages if an error occurred.
321 322 */
322 323 void
323 324 pvn_read_done(page_t *plist, int flags)
324 325 {
325 326 page_t *pp;
326 327
327 328 while (plist != NULL) {
328 329 pp = plist;
329 330 page_sub(&plist, pp);
330 331 page_io_unlock(pp);
331 332 if (flags & B_ERROR) {
332 333 /*LINTED: constant in conditional context*/
333 334 VN_DISPOSE(pp, B_INVAL, 0, kcred);
334 335 } else {
335 336 (void) page_release(pp, 0);
336 337 }
337 338 }
338 339 }
339 340
340 341 /*
341 342 * Automagic pageout.
342 343 * When memory gets tight, start freeing pages popping out of the
343 344 * write queue.
344 345 */
345 346 int write_free = 1;
346 347 pgcnt_t pages_before_pager = 200; /* LMXXX */
347 348
348 349 /*
349 350 * Routine to be called when page-out's complete.
350 351 * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
351 352 * after waiting for i/o to complete (biowait) to free the list of
352 353 * pages associated with the buffer. These pages must be locked
353 354 * before i/o is initiated.
354 355 *
355 356 * If a write error occurs, the pages are marked as modified
356 357 * so the write will be re-tried later.
357 358 */
358 359
359 360 void
360 361 pvn_write_done(page_t *plist, int flags)
361 362 {
362 363 int dfree = 0;
363 364 int pgrec = 0;
364 365 int pgout = 0;
365 366 int pgpgout = 0;
366 367 int anonpgout = 0;
367 368 int anonfree = 0;
368 369 int fspgout = 0;
369 370 int fsfree = 0;
370 371 int execpgout = 0;
371 372 int execfree = 0;
372 373 page_t *pp;
373 374 struct cpu *cpup;
374 375 struct vnode *vp = NULL; /* for probe */
375 376 uint_t ppattr;
376 377 kmutex_t *vphm = NULL;
377 378
378 379 ASSERT((flags & B_READ) == 0);
379 380
380 381 /*
381 382 * If we are about to start paging anyway, start freeing pages.
382 383 */
383 384 if (write_free && freemem < lotsfree + pages_before_pager &&
384 385 (flags & B_ERROR) == 0) {
385 386 flags |= B_FREE;
386 387 }
387 388
388 389 /*
389 390 * Handle each page involved in the i/o operation.
390 391 */
391 392 while (plist != NULL) {
392 393 pp = plist;
393 394 ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
394 395 page_sub(&plist, pp);
395 396
396 397 /* Kernel probe support */
397 398 if (vp == NULL)
398 399 vp = pp->p_vnode;
399 400
400 401 if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
401 402 /*
402 403 * Move page to the top of the v_page list.
403 404 * Skip pages modified during IO.
404 405 */
405 406 vphm = page_vnode_mutex(vp);
406 407 mutex_enter(vphm);
407 408 if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
408 409 page_vpsub(&vp->v_pages, pp);
409 410 page_vpadd(&vp->v_pages, pp);
410 411 }
411 412 mutex_exit(vphm);
412 413 }
413 414
414 415 if (flags & B_ERROR) {
415 416 /*
416 417 * Write operation failed. We don't want
417 418 * to destroy (or free) the page unless B_FORCE
418 419 * is set. We set the mod bit again and release
419 420 * all locks on the page so that it will get written
420 421 * back again later when things are hopefully
421 422 * better again.
422 423 * If B_INVAL and B_FORCE are set we really have
423 424 * to destroy the page.
424 425 */
425 426 if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
426 427 page_io_unlock(pp);
427 428 /*LINTED: constant in conditional context*/
428 429 VN_DISPOSE(pp, B_INVAL, 0, kcred);
429 430 } else {
430 431 hat_setmod_only(pp);
431 432 page_io_unlock(pp);
432 433 page_unlock(pp);
433 434 }
434 435 } else if (flags & B_INVAL) {
435 436 /*
436 437 * XXX - Failed writes with B_INVAL set are
437 438 * not handled appropriately.
438 439 */
439 440 page_io_unlock(pp);
440 441 /*LINTED: constant in conditional context*/
441 442 VN_DISPOSE(pp, B_INVAL, 0, kcred);
442 443 } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
443 444 /*
444 445 * Update statistics for pages being paged out
445 446 */
446 447 if (pp->p_vnode) {
447 448 if (IS_SWAPFSVP(pp->p_vnode)) {
448 449 anonpgout++;
449 450 } else {
450 451 if (pp->p_vnode->v_flag & VVMEXEC) {
451 452 execpgout++;
452 453 } else {
453 454 fspgout++;
454 455 }
455 456 }
456 457 }
457 458 page_io_unlock(pp);
458 459 pgout = 1;
459 460 pgpgout++;
460 461 TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
461 462 "page_ws_out:pp %p", pp);
462 463
463 464 /*
464 465 * The page_struct_lock need not be acquired to
465 466 * examine "p_lckcnt" and "p_cowcnt" since we'll
466 467 * have an "exclusive" lock if the upgrade succeeds.
467 468 */
468 469 if (page_tryupgrade(pp) &&
469 470 pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
470 471 /*
471 472 * Check if someone has reclaimed the
472 473 * page. If ref and mod are not set, no
473 474 * one is using it so we can free it.
474 475 * The rest of the system is careful
475 476 * to use the NOSYNC flag to unload
476 477 * translations set up for i/o w/o
477 478 * affecting ref and mod bits.
478 479 *
479 480 * Obtain a copy of the real hardware
480 481 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
481 482 * to avoid having to flush the cache.
482 483 */
483 484 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
484 485 HAT_SYNC_STOPON_MOD);
485 486 ck_refmod:
486 487 if (!(ppattr & (P_REF | P_MOD))) {
487 488 if (hat_page_is_mapped(pp)) {
488 489 /*
489 490 * Doesn't look like the page
490 491 * was modified so now we
491 492 * really have to unload the
492 493 * translations. Meanwhile
493 494 * another CPU could've
494 495 * modified it so we have to
495 496 * check again. We don't loop
496 497 * forever here because now
497 498 * the translations are gone
498 499 * and no one can get a new one
499 500 * since we have the "exclusive"
500 501 * lock on the page.
501 502 */
502 503 (void) hat_pageunload(pp,
503 504 HAT_FORCE_PGUNLOAD);
504 505 ppattr = hat_page_getattr(pp,
505 506 P_REF | P_MOD);
506 507 goto ck_refmod;
507 508 }
508 509 /*
509 510 * Update statistics for pages being
510 511 * freed
511 512 */
512 513 if (pp->p_vnode) {
513 514 if (IS_SWAPFSVP(pp->p_vnode)) {
514 515 anonfree++;
515 516 } else {
516 517 if (pp->p_vnode->v_flag
517 518 & VVMEXEC) {
518 519 execfree++;
519 520 } else {
520 521 fsfree++;
521 522 }
522 523 }
523 524 }
524 525 /*LINTED: constant in conditional ctx*/
525 526 VN_DISPOSE(pp, B_FREE,
526 527 (flags & B_DONTNEED), kcred);
527 528 dfree++;
528 529 } else {
529 530 page_unlock(pp);
530 531 pgrec++;
531 532 TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
532 533 "page_ws_free:pp %p", pp);
533 534 }
534 535 } else {
535 536 /*
536 537 * Page is either `locked' in memory
537 538 * or was reclaimed and now has a
538 539 * "shared" lock, so release it.
539 540 */
540 541 page_unlock(pp);
541 542 }
542 543 } else {
543 544 /*
544 545 * Neither B_FREE nor B_INVAL nor B_ERROR.
545 546 * Just release locks.
546 547 */
547 548 page_io_unlock(pp);
548 549 page_unlock(pp);
549 550 }
550 551 }
551 552
552 553 CPU_STATS_ENTER_K();
553 554 cpup = CPU; /* get cpup now that CPU cannot change */
554 555 CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
555 556 CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
556 557 CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
557 558 CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
558 559 CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
559 560 CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
560 561 CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
561 562 CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
562 563 CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
563 564 CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
564 565 CPU_STATS_EXIT_K();
565 566
566 567 /* Kernel probe */
567 568 TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
568 569 tnf_opaque, vnode, vp,
569 570 tnf_ulong, pages_pageout, pgpgout,
570 571 tnf_ulong, pages_freed, dfree,
571 572 tnf_ulong, pages_reclaimed, pgrec);
572 573 }
573 574
574 575 /*
575 576 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
576 577 * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
577 578 * operation and is only to be considered if it doesn't involve any
578 579 * waiting here. B_TRUNC indicates that the file is being truncated
579 580 * and so no i/o needs to be done. B_FORCE indicates that the page
580 581 * must be destroyed so don't try writing it out.
581 582 *
582 583 * The caller must ensure that the page is locked. Returns 1, if
583 584 * the page should be written back (the "iolock" is held in this
584 585 * case), or 0 if the page has been dealt with or has been
585 586 * unlocked.
586 587 */
587 588 int
588 589 pvn_getdirty(page_t *pp, int flags)
589 590 {
590 591 ASSERT((flags & (B_INVAL | B_FREE)) ?
591 592 PAGE_EXCL(pp) : PAGE_SHARED(pp));
592 593 ASSERT(PP_ISFREE(pp) == 0);
593 594
594 595 /*
595 596 * If trying to invalidate or free a logically `locked' page,
596 597 * forget it. Don't need page_struct_lock to check p_lckcnt and
597 598 * p_cowcnt as the page is exclusively locked.
598 599 */
599 600 if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
600 601 (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
601 602 page_unlock(pp);
602 603 return (0);
603 604 }
604 605
605 606 /*
606 607 * Now acquire the i/o lock so we can add it to the dirty
607 608 * list (if necessary). We avoid blocking on the i/o lock
608 609 * in the following cases:
609 610 *
610 611 * If B_DELWRI is set, which implies that this request is
611 612 * due to a klustering operation.
612 613 *
613 614 * If this is an async (B_ASYNC) operation and we are not doing
614 615 * invalidation (B_INVAL) [The current i/o or fsflush will ensure
615 616 * that the page is written out].
616 617 */
617 618 if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
618 619 if (!page_io_trylock(pp)) {
619 620 page_unlock(pp);
620 621 return (0);
621 622 }
622 623 } else {
623 624 page_io_lock(pp);
624 625 }
625 626
626 627 /*
627 628 * If we want to free or invalidate the page then
628 629 * we need to unload it so that anyone who wants
629 630 * it will have to take a minor fault to get it.
630 631 * Otherwise, we're just writing the page back so we
631 632 * need to sync up the hardware and software mod bit to
632 633 * detect any future modifications. We clear the
633 634 * software mod bit when we put the page on the dirty
634 635 * list.
635 636 */
636 637 if (flags & (B_INVAL | B_FREE)) {
637 638 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
638 639 } else {
639 640 (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
640 641 }
641 642
642 643 if (!hat_ismod(pp) || (flags & B_TRUNC)) {
643 644 /*
644 645 * Don't need to add it to the
645 646 * list after all.
646 647 */
647 648 page_io_unlock(pp);
648 649 if (flags & B_INVAL) {
649 650 /*LINTED: constant in conditional context*/
650 651 VN_DISPOSE(pp, B_INVAL, 0, kcred);
651 652 } else if (flags & B_FREE) {
652 653 /*LINTED: constant in conditional context*/
653 654 VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
654 655 } else {
655 656 /*
656 657 * This is an advisory path for the callers
657 658 * of VOP_PUTPAGE() who prefer freeing the
658 659 * page _only_ if no one else is accessing it.
659 660 * E.g. segmap_release()
660 661 *
661 662 * The above hat_ismod() check is useless because:
662 663 * (1) we may not be holding SE_EXCL lock;
663 664 * (2) we've not unloaded _all_ translations
664 665 *
665 666 * Let page_release() do the heavy-lifting.
666 667 */
667 668 (void) page_release(pp, 1);
668 669 }
669 670 return (0);
670 671 }
671 672
672 673 /*
673 674 * Page is dirty, get it ready for the write back
674 675 * and add page to the dirty list.
675 676 */
676 677 hat_clrrefmod(pp);
677 678
678 679 /*
679 680 * If we're going to free the page when we're done
680 681 * then we can let others try to use it starting now.
681 682 * We'll detect the fact that they used it when the
682 683 * i/o is done and avoid freeing the page.
683 684 */
684 685 if (flags & B_FREE)
685 686 page_downgrade(pp);
686 687
687 688
688 689 TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
689 690
690 691 return (1);
691 692 }
692 693
693 694
694 695 /*ARGSUSED*/
695 696 static int
696 697 marker_constructor(void *buf, void *cdrarg, int kmflags)
697 698 {
698 699 page_t *mark = buf;
699 700 bzero(mark, sizeof (page_t));
700 701 mark->p_hash = PVN_VPLIST_HASH_TAG;
701 702 return (0);
702 703 }
703 704
704 705 void
705 706 pvn_init()
706 707 {
707 708 if (pvn_vmodsort_disable == 0)
708 709 pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
709 710 marker_cache = kmem_cache_create("marker_cache",
710 711 sizeof (page_t), 0, marker_constructor,
711 712 NULL, NULL, NULL, NULL, 0);
712 713 }
713 714
714 715
715 716 /*
716 717 * Process a vnode's page list for all pages whose offset is >= off.
717 718 * Pages are to either be free'd, invalidated, or written back to disk.
718 719 *
719 720 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
720 721 * is specified, otherwise they are "shared" locked.
721 722 *
722 723 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
723 724 *
724 725 * Special marker page_t's are inserted in the list in order
725 726 * to keep track of where we are in the list when locks are dropped.
726 727 *
727 728 * Note the list is circular and insertions can happen only at the
728 729 * head and tail of the list. The algorithm ensures visiting all pages
729 730 * on the list in the following way:
730 731 *
731 732 * Drop two marker pages at the end of the list.
732 733 *
733 734 * Move one marker page backwards towards the start of the list until
734 735 * it is at the list head, processing the pages passed along the way.
735 736 *
736 737 * Due to race conditions when the vphm mutex is dropped, additional pages
737 738 * can be added to either end of the list, so we'll continue to move
738 739 * the marker and process pages until it is up against the end marker.
739 740 *
740 741 * There is one special exit condition. If we are processing a VMODSORT
741 742 * vnode and only writing back modified pages, we can stop as soon as
742 743 * we run into an unmodified page. This makes fsync(3) operations fast.
743 744 */
744 745 int
745 746 pvn_vplist_dirty(
746 747 vnode_t *vp,
747 748 u_offset_t off,
748 749 int (*putapage)(vnode_t *, page_t *, u_offset_t *,
749 750 size_t *, int, cred_t *),
750 751 int flags,
751 752 cred_t *cred)
752 753 {
753 754 page_t *pp;
754 755 page_t *mark; /* marker page that moves toward head */
755 756 page_t *end; /* marker page at end of list */
756 757 int err = 0;
757 758 int error;
758 759 kmutex_t *vphm;
759 760 se_t se;
760 761 page_t **where_to_move;
761 762
762 763 ASSERT(vp->v_type != VCHR);
763 764
764 765 if (vp->v_pages == NULL)
765 766 return (0);
766 767
767 768
768 769 /*
769 770 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
770 771 *
771 772 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
772 773 * from getting blocked while flushing pages to a dead NFS server.
773 774 */
774 775 mutex_enter(&vp->v_lock);
775 776 if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
776 777 mutex_exit(&vp->v_lock);
777 778 return (EAGAIN);
778 779 }
779 780
780 781 while (vp->v_flag & VVMLOCK)
781 782 cv_wait(&vp->v_cv, &vp->v_lock);
782 783
783 784 if (vp->v_pages == NULL) {
784 785 mutex_exit(&vp->v_lock);
785 786 return (0);
786 787 }
787 788
788 789 vp->v_flag |= VVMLOCK;
789 790 mutex_exit(&vp->v_lock);
790 791
791 792
792 793 /*
793 794 * Set up the marker pages used to walk the list
794 795 */
795 796 end = kmem_cache_alloc(marker_cache, KM_SLEEP);
796 797 end->p_vnode = vp;
797 798 end->p_offset = (u_offset_t)-2;
798 799 mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
799 800 mark->p_vnode = vp;
800 801 mark->p_offset = (u_offset_t)-1;
801 802
802 803 /*
803 804 * Grab the lock protecting the vnode's page list
804 805 * note that this lock is dropped at times in the loop.
805 806 */
806 807 vphm = page_vnode_mutex(vp);
807 808 mutex_enter(vphm);
808 809 if (vp->v_pages == NULL)
809 810 goto leave;
810 811
811 812 /*
812 813 * insert the markers and loop through the list of pages
813 814 */
814 815 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
815 816 page_vpadd(&mark->p_vpnext, end);
816 817 for (;;) {
817 818
818 819 /*
819 820 * If only doing an async write back, then we can
820 821 * stop as soon as we get to start of the list.
821 822 */
822 823 if (flags == B_ASYNC && vp->v_pages == mark)
823 824 break;
824 825
825 826 /*
826 827 * otherwise stop when we've gone through all the pages
827 828 */
828 829 if (mark->p_vpprev == end)
829 830 break;
830 831
831 832 pp = mark->p_vpprev;
832 833 if (vp->v_pages == pp)
833 834 where_to_move = &vp->v_pages;
834 835 else
835 836 where_to_move = &pp->p_vpprev->p_vpnext;
836 837
837 838 ASSERT(pp->p_vnode == vp);
838 839
839 840 /*
840 841 * If just flushing dirty pages to disk and this vnode
841 842 * is using a sorted list of pages, we can stop processing
842 843 * as soon as we find an unmodified page, since all the
843 844 * modified pages are visited first.
844 845 */
845 846 if (IS_VMODSORT(vp) &&
846 847 !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
847 848 if (!hat_ismod(pp) && !page_io_locked(pp)) {
848 849 #ifdef DEBUG
849 850 /*
850 851 * For debug kernels examine what should be
851 852 * all the remaining clean pages, asserting
852 853 * that they are not modified.
853 854 */
854 855 page_t *chk = pp;
855 856 int attr;
856 857
857 858 page_vpsub(&vp->v_pages, mark);
858 859 page_vpadd(where_to_move, mark);
859 860 do {
860 861 chk = chk->p_vpprev;
861 862 ASSERT(chk != end);
862 863 if (chk == mark)
863 864 continue;
864 865 attr = hat_page_getattr(chk, P_MOD |
865 866 P_REF);
866 867 if ((attr & P_MOD) == 0)
867 868 continue;
868 869 panic("v_pages list not all clean: "
869 870 "page_t*=%p vnode=%p off=%lx "
870 871 "attr=0x%x last clean page_t*=%p\n",
871 872 (void *)chk, (void *)chk->p_vnode,
872 873 (long)chk->p_offset, attr,
873 874 (void *)pp);
874 875 } while (chk != vp->v_pages);
875 876 #endif
876 877 break;
877 878 } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
878 879 /*
879 880 * Couldn't get io lock, wait until IO is done.
880 881 * Block only for sync IO since we don't want
881 882 * to block async IO.
882 883 */
883 884 mutex_exit(vphm);
884 885 page_io_wait(pp);
885 886 mutex_enter(vphm);
886 887 continue;
887 888 }
888 889 }
889 890
890 891 /*
891 892 * Skip this page if the offset is out of the desired range.
892 893 * Just move the marker and continue.
893 894 */
894 895 if (pp->p_offset < off) {
895 896 page_vpsub(&vp->v_pages, mark);
896 897 page_vpadd(where_to_move, mark);
897 898 continue;
898 899 }
899 900
900 901 /*
901 902 * If we are supposed to invalidate or free this
902 903 * page, then we need an exclusive lock.
903 904 */
904 905 se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
905 906
906 907 /*
907 908 * We must acquire the page lock for all synchronous
908 909 * operations (invalidate, free and write).
909 910 */
910 911 if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
911 912 /*
912 913 * If the page_lock() drops the mutex
913 914 * we must retry the loop.
914 915 */
915 916 if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
916 917 continue;
917 918
918 919 /*
919 920 * It's ok to move the marker page now.
920 921 */
921 922 page_vpsub(&vp->v_pages, mark);
922 923 page_vpadd(where_to_move, mark);
923 924 } else {
924 925
925 926 /*
926 927 * update the marker page for all remaining cases
927 928 */
928 929 page_vpsub(&vp->v_pages, mark);
929 930 page_vpadd(where_to_move, mark);
930 931
931 932 /*
932 933 * For write backs, if we can't lock the page, it's
933 934 * invalid or in the process of being destroyed. Skip
934 935 * it, assuming someone else is writing it.
935 936 */
936 937 if (!page_trylock(pp, se))
937 938 continue;
938 939 }
939 940
940 941 ASSERT(pp->p_vnode == vp);
941 942
942 943 /*
943 944 * Successfully locked the page, now figure out what to
944 945 * do with it. Free pages are easily dealt with, invalidate
945 946 * if desired or just go on to the next page.
946 947 */
947 948 if (PP_ISFREE(pp)) {
948 949 if ((flags & B_INVAL) == 0) {
949 950 page_unlock(pp);
950 951 continue;
951 952 }
952 953
953 954 /*
954 955 * Invalidate (destroy) the page.
955 956 */
956 957 mutex_exit(vphm);
957 958 page_destroy_free(pp);
958 959 mutex_enter(vphm);
959 960 continue;
960 961 }
961 962
962 963 /*
963 964 * pvn_getdirty() figures out what to do with a dirty page.
964 965 * If the page is dirty, the putapage() routine will write it
965 966 * and will kluster any other adjacent dirty pages it can.
966 967 *
967 968 * pvn_getdirty() and `(*putapage)' unlock the page.
968 969 */
969 970 mutex_exit(vphm);
970 971 if (pvn_getdirty(pp, flags)) {
971 972 error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
972 973 if (!err)
973 974 err = error;
974 975 }
975 976 mutex_enter(vphm);
976 977 }
977 978 page_vpsub(&vp->v_pages, mark);
978 979 page_vpsub(&vp->v_pages, end);
979 980
980 981 leave:
981 982 /*
982 983 * Release v_pages mutex, also VVMLOCK, and wake up blocked threads
983 984 */
984 985 mutex_exit(vphm);
985 986 kmem_cache_free(marker_cache, mark);
986 987 kmem_cache_free(marker_cache, end);
987 988 mutex_enter(&vp->v_lock);
988 989 vp->v_flag &= ~VVMLOCK;
989 990 cv_broadcast(&vp->v_cv);
990 991 mutex_exit(&vp->v_lock);
991 992 return (err);
992 993 }
993 994
994 995 /*
995 996 * Walk the vp->v_pages list, for every page call the callback function
996 997 * pointed by *page_check. If page_check returns non-zero, then mark the
997 998 * page as modified and if VMODSORT is set, move it to the end of v_pages
998 999 * list. Moving makes sense only if we have at least two pages - this also
999 1000 * avoids having v_pages temporarily being NULL after calling page_vpsub()
1000 1001 * if there was just one page.
1001 1002 */
1002 1003 void
1003 1004 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
1004 1005 {
1005 1006 page_t *pp, *next, *end;
1006 1007 kmutex_t *vphm;
1007 1008 int shuffle;
1008 1009
1009 1010 vphm = page_vnode_mutex(vp);
1010 1011 mutex_enter(vphm);
1011 1012
1012 1013 if (vp->v_pages == NULL) {
1013 1014 mutex_exit(vphm);
1014 1015 return;
1015 1016 }
1016 1017
1017 1018 end = vp->v_pages->p_vpprev;
1018 1019 shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1019 1020 pp = vp->v_pages;
1020 1021
1021 1022 for (;;) {
1022 1023 next = pp->p_vpnext;
1023 1024 if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1024 1025 /*
1025 1026 * hat_setmod_only() in contrast to hat_setmod() does
1026 1027 * not shuffle the pages and does not grab the mutex
1027 1028 * page_vnode_mutex. Exactly what we need.
1028 1029 */
1029 1030 hat_setmod_only(pp);
1030 1031 if (shuffle) {
1031 1032 page_vpsub(&vp->v_pages, pp);
1032 1033 ASSERT(vp->v_pages != NULL);
1033 1034 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1034 1035 pp);
1035 1036 }
1036 1037 }
1037 1038 /* Stop if we have just processed the last page. */
1038 1039 if (pp == end)
1039 1040 break;
1040 1041 pp = next;
1041 1042 }
1042 1043
1043 1044 mutex_exit(vphm);
1044 1045 }
1045 1046
1046 1047 /*
1047 1048 * Zero out zbytes worth of data. Caller should be aware that this
1048 1049 * routine may enter back into the fs layer (xxx_getpage). Locks
1049 1050 * that the xxx_getpage routine may need should not be held while
1050 1051 * calling this.
1051 1052 */
1052 1053 void
1053 1054 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
1054 1055 {
1055 1056 caddr_t addr;
1056 1057
1057 1058 ASSERT(vp->v_type != VCHR);
1058 1059
1059 1060 if (vp->v_pages == NULL)
1060 1061 return;
1061 1062
1062 1063 /*
1063 1064 * zbytes may be zero but there still may be some portion of
1064 1065 * a page which needs clearing (since zbytes is a function
1065 1066 * of filesystem block size, not pagesize.)
1066 1067 */
1067 1068 if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1068 1069 return;
1069 1070
1070 1071 /*
1071 1072 * We get the last page and handle the partial
1072 1073 * zeroing via kernel mappings. This will make the page
1073 1074 * dirty so that we know that when this page is written
1074 1075 * back, the zeroed information will go out with it. If
1075 1076 * the page is not currently in memory, then the kzero
1076 1077 * operation will cause it to be brought in. We use kzero
1077 1078 * instead of bzero so that if the page cannot be read in
1078 1079 * for any reason, the system will not panic. We need
1079 1080 * to zero out a minimum of the fs given zbytes, but we
1080 1081 * might also have to do more to get the entire last page.
1081 1082 */
1082 1083
1083 1084 if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1084 1085 panic("pvn_vptrunc zbytes");
1085 1086 addr = segmap_getmapflt(segkmap, vp, vplen,
1086 1087 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1087 1088 (void) kzero(addr + (vplen & MAXBOFFSET),
1088 1089 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1089 1090 (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1090 1091 }
1091 1092
1092 1093 /*
1093 - * Handles common work of the VOP_GETPAGE routines when more than
1094 - * one page must be returned by calling a file system specific operation
1095 - * to do most of the work. Must be called with the vp already locked
1096 - * by the VOP_GETPAGE routine.
1094 + * Handles common work of the VOP_GETPAGE routines by iterating page by page
1095 + * calling the getpage helper for each.
1097 1096 */
1098 1097 int
1099 1098 pvn_getpages(
1100 1099 int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
1101 1100 size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1102 1101 struct vnode *vp,
1103 1102 u_offset_t off,
1104 1103 size_t len,
1105 1104 uint_t *protp,
1106 1105 page_t *pl[],
1107 1106 size_t plsz,
1108 1107 struct seg *seg,
1109 1108 caddr_t addr,
1110 1109 enum seg_rw rw,
1111 1110 struct cred *cred)
1112 1111 {
1113 1112 page_t **ppp;
1114 1113 u_offset_t o, eoff;
1115 1114 size_t sz, xlen;
1116 1115 int err;
1117 1116
1118 - ASSERT(plsz >= len); /* insure that we have enough space */
1117 + /* ensure that we have enough space */
1118 + ASSERT(pl == NULL || plsz >= len);
1119 1119
1120 1120 /*
1121 1121 * Loop one page at a time and let getapage function fill
1122 1122 * in the next page in array. We only allow one page to be
1123 1123 * returned at a time (except for the last page) so that we
1124 1124 * don't have any problems with duplicates and other such
1125 1125 * painful problems. This is a very simple minded algorithm,
1126 1126 * but it does the job correctly. We hope that the cost of a
1127 1127 * getapage call for a resident page that we might have been
1128 1128 * able to get from an earlier call doesn't cost too much.
1129 1129 */
1130 1130 ppp = pl;
1131 - sz = PAGESIZE;
1131 + sz = (pl != NULL) ? PAGESIZE : 0;
1132 1132 eoff = off + len;
1133 1133 xlen = len;
1134 1134 for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1135 1135 xlen -= PAGESIZE) {
1136 - if (o + PAGESIZE >= eoff) {
1136 + if (o + PAGESIZE >= eoff && pl != NULL) {
1137 1137 /*
1138 1138 * Last time through - allow all of
1139 1139 * what's left of the pl[] array to be used.
1140 1140 */
1141 1141 sz = plsz - (o - off);
1142 1142 }
1143 1143 err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1144 1144 rw, cred);
1145 1145 if (err) {
1146 1146 /*
1147 1147 * Release any pages we already got.
1148 1148 */
1149 1149 if (o > off && pl != NULL) {
1150 1150 for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1151 1151 (void) page_release(*ppp, 1);
1152 1152 }
1153 1153 break;
1154 1154 }
1155 1155 if (pl != NULL)
1156 1156 ppp++;
1157 1157 }
1158 1158 return (err);
1159 1159 }
1160 1160
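
A minimal illustrative sketch (not part of this webrev) of how a filesystem's VOP_GETPAGE entry point typically hands multi-page work to pvn_getpages(), matching the prototype shown above. The names myfs_getpage and myfs_getapage are hypothetical, the usual kernel vnode/vm headers are assumed, and passing pl == NULL (e.g. read-ahead where no page list needs to be returned) is exactly the case the relaxed assertion now tolerates.

static int
myfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	/*
	 * Single-page requests go straight to the per-page helper.
	 * Larger requests are walked one page at a time by
	 * pvn_getpages(), which invokes myfs_getapage() for each page
	 * and may now be called with pl == NULL.
	 */
	if (len <= PAGESIZE)
		return (myfs_getapage(vp, (u_offset_t)off, len, protp,
		    pl, plsz, seg, addr, rw, cr));

	return (pvn_getpages(myfs_getapage, vp, (u_offset_t)off, len,
	    protp, pl, plsz, seg, addr, rw, cr));
}
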
1161 1161 /*
1162 1162 * Initialize the page list array.
1163 1163 */
1164 1164 /*ARGSUSED*/
1165 1165 void
1166 1166 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1167 1167 u_offset_t off, size_t io_len, enum seg_rw rw)
1168 1168 {
1169 1169 ssize_t sz;
1170 1170 page_t *ppcur, **ppp;
1171 1171
1172 1172 /*
1173 1173 * Set up to load plsz worth
1174 1174 * starting at the needed page.
1175 1175 */
1176 1176 while (pp != NULL && pp->p_offset != off) {
1177 1177 /*
1178 1178 * Remove page from the i/o list,
1179 1179 * release the i/o and the page lock.
1180 1180 */
1181 1181 ppcur = pp;
1182 1182 page_sub(&pp, ppcur);
1183 1183 page_io_unlock(ppcur);
1184 1184 (void) page_release(ppcur, 1);
1185 1185 }
1186 1186
1187 1187 if (pp == NULL) {
1188 1188 pl[0] = NULL;
1189 1189 return;
1190 1190 }
1191 1191
1192 1192 sz = plsz;
1193 1193
1194 1194 /*
1195 1195 * Initialize the page list array.
1196 1196 */
1197 1197 ppp = pl;
1198 1198 do {
1199 1199 ppcur = pp;
1200 1200 *ppp++ = ppcur;
1201 1201 page_sub(&pp, ppcur);
1202 1202 page_io_unlock(ppcur);
1203 1203 if (rw != S_CREATE)
1204 1204 page_downgrade(ppcur);
1205 1205 sz -= PAGESIZE;
1206 1206 } while (sz > 0 && pp != NULL);
1207 1207 *ppp = NULL; /* terminate list */
1208 1208
1209 1209 /*
1210 1210 * Now free the remaining pages that weren't
1211 1211 * loaded in the page list.
1212 1212 */
1213 1213 while (pp != NULL) {
1214 1214 ppcur = pp;
1215 1215 page_sub(&pp, ppcur);
1216 1216 page_io_unlock(ppcur);
1217 1217 (void) page_release(ppcur, 1);
1218 1218 }
1219 1219 }
73 lines elided