5384 pvn_getpages may assert in valid scenarios
--- old/usr/src/uts/common/vm/vm_pvn.c
+++ new/usr/src/uts/common/vm/vm_pvn.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
23 24 */
24 25
25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
26 27 /* All Rights Reserved */
27 28
28 29 /*
29 30 * University Copyright- Copyright (c) 1982, 1986, 1988
30 31 * The Regents of the University of California
31 32 * All Rights Reserved
32 33 *
33 34 * University Acknowledgment- Portions of this document are derived from
34 35 * software developed by the University of California, Berkeley, and its
35 36 * contributors.
36 37 */
37 38
38 39 /*
39 40 * VM - paged vnode.
40 41 *
41 42 * This file supplies vm support for the vnode operations that deal with pages.
42 43 */
43 44 #include <sys/types.h>
44 45 #include <sys/t_lock.h>
45 46 #include <sys/param.h>
46 47 #include <sys/sysmacros.h>
47 48 #include <sys/systm.h>
48 49 #include <sys/time.h>
49 50 #include <sys/buf.h>
50 51 #include <sys/vnode.h>
51 52 #include <sys/uio.h>
52 53 #include <sys/vmsystm.h>
53 54 #include <sys/mman.h>
54 55 #include <sys/vfs.h>
55 56 #include <sys/cred.h>
56 57 #include <sys/user.h>
57 58 #include <sys/kmem.h>
58 59 #include <sys/cmn_err.h>
59 60 #include <sys/debug.h>
60 61 #include <sys/cpuvar.h>
61 62 #include <sys/vtrace.h>
62 63 #include <sys/tnf_probe.h>
63 64
64 65 #include <vm/hat.h>
65 66 #include <vm/as.h>
66 67 #include <vm/seg.h>
67 68 #include <vm/rm.h>
68 69 #include <vm/pvn.h>
69 70 #include <vm/page.h>
70 71 #include <vm/seg_map.h>
71 72 #include <vm/seg_kmem.h>
72 73 #include <sys/fs/swapnode.h>
73 74
74 75 int pvn_nofodklust = 0;
75 76 int pvn_write_noklust = 0;
76 77
77 78 uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */
78 79 uint_t pvn_vmodsort_disable = 0; /* set in /etc/system to disable HAT */
79 80 /* support for vmodsort for testing */
80 81
81 82 static struct kmem_cache *marker_cache = NULL;
82 83
83 84 /*
84 85 * Find the largest contiguous block which contains `addr' for file offset
85 86 * `offset' in it while living within the file system block sizes (`vp_off'
86 87 * and `vp_len') and the address space limits for which no pages currently
87 88 * exist and which map to consecutive file offsets.
88 89 */
89 90 page_t *
90 91 pvn_read_kluster(
91 92 struct vnode *vp,
92 93 u_offset_t off,
93 94 struct seg *seg,
94 95 caddr_t addr,
95 96 u_offset_t *offp, /* return values */
96 97 size_t *lenp, /* return values */
97 98 u_offset_t vp_off,
98 99 size_t vp_len,
99 100 int isra)
100 101 {
101 102 ssize_t deltaf, deltab;
102 103 page_t *pp;
103 104 page_t *plist = NULL;
104 105 spgcnt_t pagesavail;
105 106 u_offset_t vp_end;
106 107
107 108 ASSERT(off >= vp_off && off < vp_off + vp_len);
108 109
109 110 /*
110 111 * We only want to do klustering/read ahead if there
111 112 * are more than minfree pages currently available.
112 113 */
113 114 pagesavail = freemem - minfree;
114 115
115 116 if (pagesavail <= 0)
116 117 if (isra)
117 118 return ((page_t *)NULL); /* ra case - give up */
118 119 else
119 120 pagesavail = 1; /* must return a page */
120 121
121 122 /* We calculate in pages instead of bytes due to 32-bit overflows */
122 123 if (pagesavail < (spgcnt_t)btopr(vp_len)) {
123 124 /*
124 125 * Don't have enough free memory for the
125 126 * max request, try sizing down vp request.
126 127 */
127 128 deltab = (ssize_t)(off - vp_off);
128 129 vp_len -= deltab;
129 130 vp_off += deltab;
130 131 if (pagesavail < btopr(vp_len)) {
131 132 /*
132 133 * Still not enough memory, just settle for
133 134 * pagesavail which is at least 1.
134 135 */
135 136 vp_len = ptob(pagesavail);
136 137 }
137 138 }
138 139
139 140 vp_end = vp_off + vp_len;
140 141 ASSERT(off >= vp_off && off < vp_end);
141 142
142 143 if (isra && SEGOP_KLUSTER(seg, addr, 0))
143 144 return ((page_t *)NULL); /* segment driver says no */
144 145
145 146 if ((plist = page_create_va(vp, off,
146 147 PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
147 148 return ((page_t *)NULL);
148 149
149 150 if (vp_len <= PAGESIZE || pvn_nofodklust) {
150 151 *offp = off;
151 152 *lenp = MIN(vp_len, PAGESIZE);
152 153 } else {
153 154 /*
154 155 * Scan back from front by incrementing "deltab" and
155 156 * comparing "off" with "vp_off + deltab" to avoid
156 157 * "signed" versus "unsigned" conversion problems.
157 158 */
158 159 for (deltab = PAGESIZE; off >= vp_off + deltab;
159 160 deltab += PAGESIZE) {
160 161 /*
161 162 * Call back to the segment driver to verify that
162 163 * the klustering/read ahead operation makes sense.
163 164 */
164 165 if (SEGOP_KLUSTER(seg, addr, -deltab))
165 166 break; /* page not eligible */
166 167 if ((pp = page_create_va(vp, off - deltab,
167 168 PAGESIZE, PG_EXCL, seg, addr - deltab))
168 169 == NULL)
169 170 break; /* already have the page */
170 171 /*
171 172 * Add page to front of page list.
172 173 */
173 174 page_add(&plist, pp);
174 175 }
175 176 deltab -= PAGESIZE;
176 177
177 178 /* scan forward from front */
178 179 for (deltaf = PAGESIZE; off + deltaf < vp_end;
179 180 deltaf += PAGESIZE) {
180 181 /*
181 182 * Call back to the segment driver to verify that
182 183 * the klustering/read ahead operation makes sense.
183 184 */
184 185 if (SEGOP_KLUSTER(seg, addr, deltaf))
185 186 break; /* page not file extension */
186 187 if ((pp = page_create_va(vp, off + deltaf,
187 188 PAGESIZE, PG_EXCL, seg, addr + deltaf))
188 189 == NULL)
189 190 break; /* already have page */
190 191
191 192 /*
192 193 * Add page to end of page list.
193 194 */
194 195 page_add(&plist, pp);
195 196 plist = plist->p_next;
196 197 }
197 198 *offp = off = off - deltab;
198 199 *lenp = deltab + deltaf;
199 200 ASSERT(off >= vp_off);
200 201
201 202 /*
202 203 * If we ended up getting more than was actually
203 204 * requested, retract the returned length to only
204 205 * reflect what was requested. This might happen
205 206 * if we were allowed to kluster pages across a
206 207 * span of (say) 5 frags, and frag size is less
207 208 * than PAGESIZE. We need a whole number of
208 209 * pages to contain those frags, but the returned
209 210 * size should only allow the returned range to
210 211 * extend as far as the end of the frags.
211 212 */
212 213 if ((vp_off + vp_len) < (off + *lenp)) {
213 214 ASSERT(vp_end > off);
214 215 *lenp = vp_end - off;
215 216 }
216 217 }
217 218 TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
218 219 "pvn_read_kluster:seg %p addr %x isra %x",
219 220 seg, addr, isra);
220 221 return (plist);
221 222 }
222 223
223 224 /*
224 225 * Handle pages for this vnode on either side of the page "pp"
225 226 * which has been locked by the caller. This routine will also
226 227 * do klustering in the range [vp_off, vp_off + vp_len] up
227 228 * until a page which is not found. The offset and length
228 229 * of pages included is returned in "*offp" and "*lenp".
229 230 *
230 231 * Returns a list of dirty locked pages all ready to be
231 232 * written back.
232 233 */
233 234 page_t *
234 235 pvn_write_kluster(
235 236 struct vnode *vp,
236 237 page_t *pp,
237 238 u_offset_t *offp, /* return values */
238 239 size_t *lenp, /* return values */
239 240 u_offset_t vp_off,
240 241 size_t vp_len,
241 242 int flags)
242 243 {
243 244 u_offset_t off;
244 245 page_t *dirty;
245 246 size_t deltab, deltaf;
246 247 se_t se;
247 248 u_offset_t vp_end;
248 249
249 250 off = pp->p_offset;
250 251
251 252 /*
252 253 * Klustering should not be done if we are invalidating
253 254 * pages since we could destroy pages that belong to
254 255 * some other process if this is a swap vnode.
255 256 */
256 257 if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
257 258 *offp = off;
258 259 *lenp = PAGESIZE;
259 260 return (pp);
260 261 }
261 262
262 263 if (flags & (B_FREE | B_INVAL))
263 264 se = SE_EXCL;
264 265 else
265 266 se = SE_SHARED;
266 267
267 268 dirty = pp;
268 269 /*
269 270 * Scan backwards looking for pages to kluster by incrementing
270 271 * "deltab" and comparing "off" with "vp_off + deltab" to
271 272 * avoid "signed" versus "unsigned" conversion problems.
272 273 */
273 274 for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
274 275 pp = page_lookup_nowait(vp, off - deltab, se);
275 276 if (pp == NULL)
276 277 break; /* page not found */
277 278 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
278 279 break;
279 280 page_add(&dirty, pp);
280 281 }
281 282 deltab -= PAGESIZE;
282 283
283 284 vp_end = vp_off + vp_len;
284 285 /* now scan forwards looking for pages to kluster */
285 286 for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
286 287 pp = page_lookup_nowait(vp, off + deltaf, se);
287 288 if (pp == NULL)
288 289 break; /* page not found */
289 290 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
290 291 break;
291 292 page_add(&dirty, pp);
292 293 dirty = dirty->p_next;
293 294 }
294 295
295 296 *offp = off - deltab;
296 297 *lenp = deltab + deltaf;
297 298 return (dirty);
298 299 }
299 300
300 301 /*
301 302 * Generic entry point used to release the "shared/exclusive" lock
302 303 * and the "p_iolock" on pages after i/o is complete.
303 304 */
304 305 void
305 306 pvn_io_done(page_t *plist)
306 307 {
307 308 page_t *pp;
308 309
309 310 while (plist != NULL) {
310 311 pp = plist;
311 312 page_sub(&plist, pp);
312 313 page_io_unlock(pp);
313 314 page_unlock(pp);
314 315 }
315 316 }
316 317
317 318 /*
318 319 * Entry point to be used by file system getpage subr's and
319 320 * other such routines which either want to unlock pages (B_ASYNC
320 321 * request) or destroy a list of pages if an error occurred.
321 322 */
322 323 void
323 324 pvn_read_done(page_t *plist, int flags)
324 325 {
325 326 page_t *pp;
326 327
327 328 while (plist != NULL) {
328 329 pp = plist;
329 330 page_sub(&plist, pp);
330 331 page_io_unlock(pp);
331 332 if (flags & B_ERROR) {
332 333 /*LINTED: constant in conditional context*/
333 334 VN_DISPOSE(pp, B_INVAL, 0, kcred);
334 335 } else {
335 336 (void) page_release(pp, 0);
336 337 }
337 338 }
338 339 }
339 340
340 341 /*
341 342 * Automagic pageout.
342 343 * When memory gets tight, start freeing pages popping out of the
343 344 * write queue.
344 345 */
345 346 int write_free = 1;
346 347 pgcnt_t pages_before_pager = 200; /* LMXXX */
347 348
348 349 /*
349 350 * Routine to be called when page-out's complete.
350 351 * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
351 352 * after waiting for i/o to complete (biowait) to free the list of
352 353 * pages associated with the buffer. These pages must be locked
353 354 * before i/o is initiated.
354 355 *
355 356 * If a write error occurs, the pages are marked as modified
356 357 * so the write will be re-tried later.
357 358 */
358 359
359 360 void
360 361 pvn_write_done(page_t *plist, int flags)
361 362 {
362 363 int dfree = 0;
363 364 int pgrec = 0;
364 365 int pgout = 0;
365 366 int pgpgout = 0;
366 367 int anonpgout = 0;
367 368 int anonfree = 0;
368 369 int fspgout = 0;
369 370 int fsfree = 0;
370 371 int execpgout = 0;
371 372 int execfree = 0;
372 373 page_t *pp;
373 374 struct cpu *cpup;
374 375 struct vnode *vp = NULL; /* for probe */
375 376 uint_t ppattr;
376 377 kmutex_t *vphm = NULL;
377 378
378 379 ASSERT((flags & B_READ) == 0);
379 380
380 381 /*
381 382 * If we are about to start paging anyway, start freeing pages.
382 383 */
383 384 if (write_free && freemem < lotsfree + pages_before_pager &&
384 385 (flags & B_ERROR) == 0) {
385 386 flags |= B_FREE;
386 387 }
387 388
388 389 /*
389 390 * Handle each page involved in the i/o operation.
390 391 */
391 392 while (plist != NULL) {
392 393 pp = plist;
393 394 ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
394 395 page_sub(&plist, pp);
395 396
396 397 /* Kernel probe support */
397 398 if (vp == NULL)
398 399 vp = pp->p_vnode;
399 400
400 401 if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
401 402 /*
402 403 * Move page to the top of the v_page list.
403 404 * Skip pages modified during IO.
404 405 */
405 406 vphm = page_vnode_mutex(vp);
406 407 mutex_enter(vphm);
407 408 if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
408 409 page_vpsub(&vp->v_pages, pp);
409 410 page_vpadd(&vp->v_pages, pp);
410 411 }
411 412 mutex_exit(vphm);
412 413 }
413 414
414 415 if (flags & B_ERROR) {
415 416 /*
416 417 * Write operation failed. We don't want
417 418 * to destroy (or free) the page unless B_FORCE
418 419 * is set. We set the mod bit again and release
419 420 * all locks on the page so that it will get written
420 421 * back again later when things are hopefully
421 422 * better again.
422 423 * If B_INVAL and B_FORCE are set we really have
423 424 * to destroy the page.
424 425 */
425 426 if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
426 427 page_io_unlock(pp);
427 428 /*LINTED: constant in conditional context*/
428 429 VN_DISPOSE(pp, B_INVAL, 0, kcred);
429 430 } else {
430 431 hat_setmod_only(pp);
431 432 page_io_unlock(pp);
432 433 page_unlock(pp);
433 434 }
434 435 } else if (flags & B_INVAL) {
435 436 /*
436 437 * XXX - Failed writes with B_INVAL set are
437 438 * not handled appropriately.
438 439 */
439 440 page_io_unlock(pp);
440 441 /*LINTED: constant in conditional context*/
441 442 VN_DISPOSE(pp, B_INVAL, 0, kcred);
442 443 } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
443 444 /*
444 445 * Update statistics for pages being paged out
445 446 */
446 447 if (pp->p_vnode) {
447 448 if (IS_SWAPFSVP(pp->p_vnode)) {
448 449 anonpgout++;
449 450 } else {
450 451 if (pp->p_vnode->v_flag & VVMEXEC) {
451 452 execpgout++;
452 453 } else {
453 454 fspgout++;
454 455 }
455 456 }
456 457 }
457 458 page_io_unlock(pp);
458 459 pgout = 1;
459 460 pgpgout++;
460 461 TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
461 462 "page_ws_out:pp %p", pp);
462 463
463 464 /*
464 465 * The page_struct_lock need not be acquired to
465 466 * examine "p_lckcnt" and "p_cowcnt" since we'll
466 467 * have an "exclusive" lock if the upgrade succeeds.
467 468 */
468 469 if (page_tryupgrade(pp) &&
469 470 pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
470 471 /*
471 472 * Check if someone has reclaimed the
472 473 * page. If ref and mod are not set, no
473 474 * one is using it so we can free it.
474 475 * The rest of the system is careful
475 476 * to use the NOSYNC flag to unload
476 477 * translations set up for i/o w/o
477 478 * affecting ref and mod bits.
478 479 *
479 480 * Obtain a copy of the real hardware
480 481 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
481 482 * to avoid having to flush the cache.
482 483 */
483 484 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
484 485 HAT_SYNC_STOPON_MOD);
485 486 ck_refmod:
486 487 if (!(ppattr & (P_REF | P_MOD))) {
487 488 if (hat_page_is_mapped(pp)) {
488 489 /*
489 490 * Doesn't look like the page
490 491 * was modified so now we
491 492 * really have to unload the
492 493 * translations. Meanwhile
493 494 * another CPU could've
494 495 * modified it so we have to
495 496 * check again. We don't loop
496 497 * forever here because now
497 498 * the translations are gone
498 499 * and no one can get a new one
499 500 * since we have the "exclusive"
500 501 * lock on the page.
501 502 */
502 503 (void) hat_pageunload(pp,
503 504 HAT_FORCE_PGUNLOAD);
504 505 ppattr = hat_page_getattr(pp,
505 506 P_REF | P_MOD);
506 507 goto ck_refmod;
507 508 }
508 509 /*
509 510 * Update statistics for pages being
510 511 * freed
511 512 */
512 513 if (pp->p_vnode) {
513 514 if (IS_SWAPFSVP(pp->p_vnode)) {
514 515 anonfree++;
515 516 } else {
516 517 if (pp->p_vnode->v_flag
517 518 & VVMEXEC) {
518 519 execfree++;
519 520 } else {
520 521 fsfree++;
521 522 }
522 523 }
523 524 }
524 525 /*LINTED: constant in conditional ctx*/
525 526 VN_DISPOSE(pp, B_FREE,
526 527 (flags & B_DONTNEED), kcred);
527 528 dfree++;
528 529 } else {
529 530 page_unlock(pp);
530 531 pgrec++;
531 532 TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
532 533 "page_ws_free:pp %p", pp);
533 534 }
534 535 } else {
535 536 /*
536 537 * Page is either `locked' in memory
537 538 * or was reclaimed and now has a
538 539 * "shared" lock, so release it.
539 540 */
540 541 page_unlock(pp);
541 542 }
542 543 } else {
543 544 /*
544 545 * Neither B_FREE nor B_INVAL nor B_ERROR.
545 546 * Just release locks.
546 547 */
547 548 page_io_unlock(pp);
548 549 page_unlock(pp);
549 550 }
550 551 }
551 552
552 553 CPU_STATS_ENTER_K();
553 554 cpup = CPU; /* get cpup now that CPU cannot change */
554 555 CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
555 556 CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
556 557 CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
557 558 CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
558 559 CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
559 560 CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
560 561 CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
561 562 CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
562 563 CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
563 564 CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
564 565 CPU_STATS_EXIT_K();
565 566
566 567 /* Kernel probe */
567 568 TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
568 569 tnf_opaque, vnode, vp,
569 570 tnf_ulong, pages_pageout, pgpgout,
570 571 tnf_ulong, pages_freed, dfree,
571 572 tnf_ulong, pages_reclaimed, pgrec);
572 573 }
573 574
574 575 /*
575 576 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
576 577 * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
577 578 * operation and is only to be considered if it doesn't involve any
578 579 * waiting here. B_TRUNC indicates that the file is being truncated
579 580 * and so no i/o needs to be done. B_FORCE indicates that the page
580 581 * must be destroyed so don't try writing it out.
581 582 *
582 583 * The caller must ensure that the page is locked. Returns 1, if
583 584 * the page should be written back (the "iolock" is held in this
584 585 * case), or 0 if the page has been dealt with or has been
585 586 * unlocked.
586 587 */
587 588 int
588 589 pvn_getdirty(page_t *pp, int flags)
589 590 {
590 591 ASSERT((flags & (B_INVAL | B_FREE)) ?
591 592 PAGE_EXCL(pp) : PAGE_SHARED(pp));
592 593 ASSERT(PP_ISFREE(pp) == 0);
593 594
594 595 /*
595 596 * If trying to invalidate or free a logically `locked' page,
596 597 * forget it. Don't need page_struct_lock to check p_lckcnt and
597 598 * p_cowcnt as the page is exclusively locked.
598 599 */
599 600 if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
600 601 (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
601 602 page_unlock(pp);
602 603 return (0);
603 604 }
604 605
605 606 /*
606 607 * Now acquire the i/o lock so we can add it to the dirty
607 608 * list (if necessary). We avoid blocking on the i/o lock
608 609 * in the following cases:
609 610 *
610 611 * If B_DELWRI is set, which implies that this request is
611 612 * due to a klustering operation.
612 613 *
613 614 * If this is an async (B_ASYNC) operation and we are not doing
614 615 * invalidation (B_INVAL) [The current i/o or fsflush will ensure
615 616 * that the page is written out].
616 617 */
617 618 if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
618 619 if (!page_io_trylock(pp)) {
619 620 page_unlock(pp);
620 621 return (0);
621 622 }
622 623 } else {
623 624 page_io_lock(pp);
624 625 }
625 626
626 627 /*
627 628 * If we want to free or invalidate the page then
628 629 * we need to unload it so that anyone who wants
629 630 * it will have to take a minor fault to get it.
630 631 * Otherwise, we're just writing the page back so we
631 632 * need to sync up the hardware and software mod bit to
632 633 * detect any future modifications. We clear the
633 634 * software mod bit when we put the page on the dirty
634 635 * list.
635 636 */
636 637 if (flags & (B_INVAL | B_FREE)) {
637 638 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
638 639 } else {
639 640 (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
640 641 }
641 642
642 643 if (!hat_ismod(pp) || (flags & B_TRUNC)) {
643 644 /*
644 645 * Don't need to add it to the
645 646 * list after all.
646 647 */
647 648 page_io_unlock(pp);
648 649 if (flags & B_INVAL) {
649 650 /*LINTED: constant in conditional context*/
650 651 VN_DISPOSE(pp, B_INVAL, 0, kcred);
651 652 } else if (flags & B_FREE) {
652 653 /*LINTED: constant in conditional context*/
653 654 VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
654 655 } else {
655 656 /*
656 657 * This is an advisory path for the callers
657 658 * of VOP_PUTPAGE() who prefer freeing the
658 659 * page _only_ if no one else is accessing it.
659 660 * E.g. segmap_release()
660 661 *
661 662 * The above hat_ismod() check is useless because:
662 663 * (1) we may not be holding SE_EXCL lock;
663 664 * (2) we've not unloaded _all_ translations
664 665 *
665 666 * Let page_release() do the heavy-lifting.
666 667 */
667 668 (void) page_release(pp, 1);
668 669 }
669 670 return (0);
670 671 }
671 672
672 673 /*
673 674 * Page is dirty, get it ready for the write back
674 675 * and add page to the dirty list.
675 676 */
676 677 hat_clrrefmod(pp);
677 678
678 679 /*
679 680 * If we're going to free the page when we're done
680 681 * then we can let others try to use it starting now.
681 682 * We'll detect the fact that they used it when the
682 683 * i/o is done and avoid freeing the page.
683 684 */
684 685 if (flags & B_FREE)
685 686 page_downgrade(pp);
686 687
687 688
688 689 TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
689 690
690 691 return (1);
691 692 }
692 693
693 694
694 695 /*ARGSUSED*/
695 696 static int
696 697 marker_constructor(void *buf, void *cdrarg, int kmflags)
697 698 {
698 699 page_t *mark = buf;
699 700 bzero(mark, sizeof (page_t));
700 701 mark->p_hash = PVN_VPLIST_HASH_TAG;
701 702 return (0);
702 703 }
703 704
704 705 void
705 706 pvn_init()
706 707 {
707 708 if (pvn_vmodsort_disable == 0)
708 709 pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
709 710 marker_cache = kmem_cache_create("marker_cache",
710 711 sizeof (page_t), 0, marker_constructor,
711 712 NULL, NULL, NULL, NULL, 0);
712 713 }
713 714
714 715
715 716 /*
716 717 * Process a vnode's page list for all pages whose offset is >= off.
717 718 * Pages are to either be free'd, invalidated, or written back to disk.
718 719 *
719 720 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
720 721 * is specified, otherwise they are "shared" locked.
721 722 *
722 723 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
723 724 *
724 725 * Special marker page_t's are inserted in the list in order
725 726 * to keep track of where we are in the list when locks are dropped.
726 727 *
727 728 * Note the list is circular and insertions can happen only at the
728 729 * head and tail of the list. The algorithm ensures visiting all pages
729 730 * on the list in the following way:
730 731 *
731 732 * Drop two marker pages at the end of the list.
732 733 *
733 734 * Move one marker page backwards towards the start of the list until
734 735 * it is at the list head, processing the pages passed along the way.
735 736 *
736 737 * Due to race conditions when the vphm mutex is dropped, additional pages
737 738 * can be added to either end of the list, so we'll continue to move
738 739 * the marker and process pages until it is up against the end marker.
739 740 *
740 741 * There is one special exit condition. If we are processing a VMODSORT
741 742 * vnode and only writing back modified pages, we can stop as soon as
742 743 * we run into an unmodified page. This makes fsync(3) operations fast.
743 744 */
744 745 int
745 746 pvn_vplist_dirty(
746 747 vnode_t *vp,
747 748 u_offset_t off,
748 749 int (*putapage)(vnode_t *, page_t *, u_offset_t *,
749 750 size_t *, int, cred_t *),
750 751 int flags,
751 752 cred_t *cred)
752 753 {
753 754 page_t *pp;
754 755 page_t *mark; /* marker page that moves toward head */
755 756 page_t *end; /* marker page at end of list */
756 757 int err = 0;
757 758 int error;
758 759 kmutex_t *vphm;
759 760 se_t se;
760 761 page_t **where_to_move;
761 762
762 763 ASSERT(vp->v_type != VCHR);
763 764
764 765 if (vp->v_pages == NULL)
765 766 return (0);
766 767
767 768
768 769 /*
769 770 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
770 771 *
771 772 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
772 773 * from getting blocked while flushing pages to a dead NFS server.
773 774 */
774 775 mutex_enter(&vp->v_lock);
775 776 if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
776 777 mutex_exit(&vp->v_lock);
777 778 return (EAGAIN);
778 779 }
779 780
780 781 while (vp->v_flag & VVMLOCK)
781 782 cv_wait(&vp->v_cv, &vp->v_lock);
782 783
783 784 if (vp->v_pages == NULL) {
784 785 mutex_exit(&vp->v_lock);
785 786 return (0);
786 787 }
787 788
788 789 vp->v_flag |= VVMLOCK;
789 790 mutex_exit(&vp->v_lock);
790 791
791 792
792 793 /*
793 794 * Set up the marker pages used to walk the list
794 795 */
795 796 end = kmem_cache_alloc(marker_cache, KM_SLEEP);
796 797 end->p_vnode = vp;
797 798 end->p_offset = (u_offset_t)-2;
798 799 mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
799 800 mark->p_vnode = vp;
800 801 mark->p_offset = (u_offset_t)-1;
801 802
802 803 /*
803 804 * Grab the lock protecting the vnode's page list
804 805 * note that this lock is dropped at times in the loop.
805 806 */
806 807 vphm = page_vnode_mutex(vp);
807 808 mutex_enter(vphm);
808 809 if (vp->v_pages == NULL)
809 810 goto leave;
810 811
811 812 /*
812 813 * insert the markers and loop through the list of pages
813 814 */
814 815 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
815 816 page_vpadd(&mark->p_vpnext, end);
816 817 for (;;) {
817 818
818 819 /*
819 820 * If only doing an async write back, then we can
820 821 * stop as soon as we get to start of the list.
821 822 */
822 823 if (flags == B_ASYNC && vp->v_pages == mark)
823 824 break;
824 825
825 826 /*
826 827 * otherwise stop when we've gone through all the pages
827 828 */
828 829 if (mark->p_vpprev == end)
829 830 break;
830 831
831 832 pp = mark->p_vpprev;
832 833 if (vp->v_pages == pp)
833 834 where_to_move = &vp->v_pages;
834 835 else
835 836 where_to_move = &pp->p_vpprev->p_vpnext;
836 837
837 838 ASSERT(pp->p_vnode == vp);
838 839
839 840 /*
840 841 * If just flushing dirty pages to disk and this vnode
841 842 * is using a sorted list of pages, we can stop processing
842 843 * as soon as we find an unmodified page, since all the
843 844 * modified pages are visited first.
844 845 */
845 846 if (IS_VMODSORT(vp) &&
846 847 !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
847 848 if (!hat_ismod(pp) && !page_io_locked(pp)) {
848 849 #ifdef DEBUG
849 850 /*
850 851 * For debug kernels examine what should be
851 852 * all the remaining clean pages, asserting
852 853 * that they are not modified.
853 854 */
854 855 page_t *chk = pp;
855 856 int attr;
856 857
857 858 page_vpsub(&vp->v_pages, mark);
858 859 page_vpadd(where_to_move, mark);
859 860 do {
860 861 chk = chk->p_vpprev;
861 862 ASSERT(chk != end);
862 863 if (chk == mark)
863 864 continue;
864 865 attr = hat_page_getattr(chk, P_MOD |
865 866 P_REF);
866 867 if ((attr & P_MOD) == 0)
867 868 continue;
868 869 panic("v_pages list not all clean: "
869 870 "page_t*=%p vnode=%p off=%lx "
870 871 "attr=0x%x last clean page_t*=%p\n",
871 872 (void *)chk, (void *)chk->p_vnode,
872 873 (long)chk->p_offset, attr,
873 874 (void *)pp);
874 875 } while (chk != vp->v_pages);
875 876 #endif
876 877 break;
877 878 } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
878 879 /*
879 880 * Couldn't get io lock, wait until IO is done.
880 881 * Block only for sync IO since we don't want
881 882 * to block async IO.
882 883 */
883 884 mutex_exit(vphm);
884 885 page_io_wait(pp);
885 886 mutex_enter(vphm);
886 887 continue;
887 888 }
888 889 }
889 890
890 891 /*
891 892 * Skip this page if the offset is out of the desired range.
892 893 * Just move the marker and continue.
893 894 */
894 895 if (pp->p_offset < off) {
895 896 page_vpsub(&vp->v_pages, mark);
896 897 page_vpadd(where_to_move, mark);
897 898 continue;
898 899 }
899 900
900 901 /*
901 902 * If we are supposed to invalidate or free this
902 903 * page, then we need an exclusive lock.
903 904 */
904 905 se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
905 906
906 907 /*
907 908 * We must acquire the page lock for all synchronous
908 909 * operations (invalidate, free and write).
909 910 */
910 911 if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
911 912 /*
912 913 * If the page_lock() drops the mutex
913 914 * we must retry the loop.
914 915 */
915 916 if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
916 917 continue;
917 918
918 919 /*
919 920 * It's ok to move the marker page now.
920 921 */
921 922 page_vpsub(&vp->v_pages, mark);
922 923 page_vpadd(where_to_move, mark);
923 924 } else {
924 925
925 926 /*
926 927 * update the marker page for all remaining cases
927 928 */
928 929 page_vpsub(&vp->v_pages, mark);
929 930 page_vpadd(where_to_move, mark);
930 931
931 932 /*
932 933 * For write backs, if we can't lock the page, it's
933 934 * invalid or in the process of being destroyed. Skip
934 935 * it, assuming someone else is writing it.
935 936 */
936 937 if (!page_trylock(pp, se))
937 938 continue;
938 939 }
939 940
940 941 ASSERT(pp->p_vnode == vp);
941 942
942 943 /*
943 944 * Successfully locked the page, now figure out what to
944 945 * do with it. Free pages are easily dealt with, invalidate
945 946 * if desired or just go on to the next page.
946 947 */
947 948 if (PP_ISFREE(pp)) {
948 949 if ((flags & B_INVAL) == 0) {
949 950 page_unlock(pp);
950 951 continue;
951 952 }
952 953
953 954 /*
954 955 * Invalidate (destroy) the page.
955 956 */
956 957 mutex_exit(vphm);
957 958 page_destroy_free(pp);
958 959 mutex_enter(vphm);
959 960 continue;
960 961 }
961 962
962 963 /*
963 964 * pvn_getdirty() figures out what to do with a dirty page.
964 965 * If the page is dirty, the putapage() routine will write it
965 966 * and will kluster any other adjacent dirty pages it can.
966 967 *
967 968 * pvn_getdirty() and `(*putapage)' unlock the page.
968 969 */
969 970 mutex_exit(vphm);
970 971 if (pvn_getdirty(pp, flags)) {
971 972 error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
972 973 if (!err)
973 974 err = error;
974 975 }
975 976 mutex_enter(vphm);
976 977 }
977 978 page_vpsub(&vp->v_pages, mark);
978 979 page_vpsub(&vp->v_pages, end);
979 980
980 981 leave:
981 982 /*
982 983 * Release v_pages mutex, also VVMLOCK, and wake up blocked threads
983 984 */
984 985 mutex_exit(vphm);
985 986 kmem_cache_free(marker_cache, mark);
986 987 kmem_cache_free(marker_cache, end);
987 988 mutex_enter(&vp->v_lock);
988 989 vp->v_flag &= ~VVMLOCK;
989 990 cv_broadcast(&vp->v_cv);
990 991 mutex_exit(&vp->v_lock);
991 992 return (err);
992 993 }
993 994
994 995 /*
995 996 * Walk the vp->v_pages list, for every page call the callback function
996 997 * pointed by *page_check. If page_check returns non-zero, then mark the
997 998 * page as modified and if VMODSORT is set, move it to the end of v_pages
998 999 * list. Moving makes sense only if we have at least two pages - this also
999 1000 * avoids having v_pages temporarily being NULL after calling page_vpsub()
1000 1001 * if there was just one page.
1001 1002 */
1002 1003 void
1003 1004 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
1004 1005 {
1005 1006 page_t *pp, *next, *end;
1006 1007 kmutex_t *vphm;
1007 1008 int shuffle;
1008 1009
1009 1010 vphm = page_vnode_mutex(vp);
1010 1011 mutex_enter(vphm);
1011 1012
1012 1013 if (vp->v_pages == NULL) {
1013 1014 mutex_exit(vphm);
1014 1015 return;
1015 1016 }
1016 1017
1017 1018 end = vp->v_pages->p_vpprev;
1018 1019 shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1019 1020 pp = vp->v_pages;
1020 1021
1021 1022 for (;;) {
1022 1023 next = pp->p_vpnext;
1023 1024 if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1024 1025 /*
1025 1026 * hat_setmod_only() in contrast to hat_setmod() does
1026 1027 * not shuffle the pages and does not grab the mutex
1027 1028 * page_vnode_mutex. Exactly what we need.
1028 1029 */
1029 1030 hat_setmod_only(pp);
1030 1031 if (shuffle) {
1031 1032 page_vpsub(&vp->v_pages, pp);
1032 1033 ASSERT(vp->v_pages != NULL);
1033 1034 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1034 1035 pp);
1035 1036 }
1036 1037 }
1037 1038 /* Stop if we have just processed the last page. */
1038 1039 if (pp == end)
1039 1040 break;
1040 1041 pp = next;
1041 1042 }
1042 1043
1043 1044 mutex_exit(vphm);
1044 1045 }
1045 1046
1046 1047 /*
1047 1048 * Zero out zbytes worth of data. Caller should be aware that this
1048 1049 * routine may enter back into the fs layer (xxx_getpage). Locks
1049 1050 * that the xxx_getpage routine may need should not be held while
1050 1051 * calling this.
1051 1052 */
1052 1053 void
1053 1054 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
1054 1055 {
1055 1056 caddr_t addr;
1056 1057
1057 1058 ASSERT(vp->v_type != VCHR);
1058 1059
1059 1060 if (vp->v_pages == NULL)
1060 1061 return;
1061 1062
1062 1063 /*
1063 1064 * zbytes may be zero but there still may be some portion of
1064 1065 * a page which needs clearing (since zbytes is a function
1065 1066 * of filesystem block size, not pagesize.)
1066 1067 */
1067 1068 if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1068 1069 return;
1069 1070
1070 1071 /*
1071 1072 * We get the last page and handle the partial
1072 1073 * zeroing via kernel mappings. This will make the page
1073 1074 * dirty so that we know that when this page is written
1074 1075 * back, the zeroed information will go out with it. If
1075 1076 * the page is not currently in memory, then the kzero
1076 1077 * operation will cause it to be brought in. We use kzero
1077 1078 * instead of bzero so that if the page cannot be read in
1078 1079 * for any reason, the system will not panic. We need
1079 1080 * to zero out a minimum of the fs given zbytes, but we
1080 1081 * might also have to do more to get the entire last page.
1081 1082 */
1082 1083
1083 1084 if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1084 1085 panic("pvn_vptrunc zbytes");
1085 1086 addr = segmap_getmapflt(segkmap, vp, vplen,
1086 1087 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1087 1088 (void) kzero(addr + (vplen & MAXBOFFSET),
1088 1089 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1089 1090 (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1090 1091 }
1091 1092
1092 1093 /*
1093 - * Handles common work of the VOP_GETPAGE routines when more than
1094 - * one page must be returned by calling a file system specific operation
1095 - * to do most of the work. Must be called with the vp already locked
1096 - * by the VOP_GETPAGE routine.
1094 + * Handles common work of the VOP_GETPAGE routines by iterating page by page
1095 + * calling the getpage helper for each.
1097 1096 */
1098 1097 int
1099 1098 pvn_getpages(
1100 1099 int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
1101 1100 size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1102 1101 struct vnode *vp,
1103 1102 u_offset_t off,
1104 1103 size_t len,
1105 1104 uint_t *protp,
1106 1105 page_t *pl[],
1107 1106 size_t plsz,
1108 1107 struct seg *seg,
1109 1108 caddr_t addr,
1110 1109 enum seg_rw rw,
1111 1110 struct cred *cred)
1112 1111 {
1113 1112 page_t **ppp;
1114 1113 u_offset_t o, eoff;
1115 1114 size_t sz, xlen;
1116 1115 int err;
1117 1116
1118 - ASSERT(plsz >= len); /* insure that we have enough space */
1117 + /* ensure that we have enough space */
1118 + ASSERT(pl == NULL || plsz >= len);
1119 1119
1120 1120 /*
1121 1121 * Loop one page at a time and let getapage function fill
1122 1122 * in the next page in array. We only allow one page to be
1123 1123 * returned at a time (except for the last page) so that we
1124 1124 * don't have any problems with duplicates and other such
1125 1125 * painful problems. This is a very simple minded algorithm,
1126 1126 * but it does the job correctly. We hope that the cost of a
1127 1127 * getapage call for a resident page that we might have been
1128 1128 * able to get from an earlier call doesn't cost too much.
1129 1129 */
1130 1130 ppp = pl;
1131 - sz = PAGESIZE;
1131 + sz = (pl != NULL) ? PAGESIZE : 0;
1132 1132 eoff = off + len;
1133 1133 xlen = len;
1134 1134 for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1135 1135 xlen -= PAGESIZE) {
1136 - if (o + PAGESIZE >= eoff) {
1136 + if (o + PAGESIZE >= eoff && pl != NULL) {
1137 1137 /*
1138 1138 * Last time through - allow all of
1139 1139 * what's left of the pl[] array to be used.
1140 1140 */
1141 1141 sz = plsz - (o - off);
1142 1142 }
1143 1143 err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1144 1144 rw, cred);
1145 1145 if (err) {
1146 1146 /*
1147 1147 * Release any pages we already got.
1148 1148 */
1149 1149 if (o > off && pl != NULL) {
1150 1150 for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1151 1151 (void) page_release(*ppp, 1);
1152 1152 }
1153 1153 break;
1154 1154 }
1155 1155 if (pl != NULL)
1156 1156 ppp++;
1157 1157 }
1158 1158 return (err);
1159 1159 }
1160 1160
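
A minimal illustrative sketch (not part of this webrev) of how a filesystem's VOP_GETPAGE entry point typically hands multi-page work to pvn_getpages(), matching the prototype shown above. The names myfs_getpage and myfs_getapage are hypothetical, the usual kernel vnode/vm headers are assumed, and passing pl == NULL (e.g. read-ahead where no page list needs to be returned) is exactly the case the relaxed assertion now tolerates.

static int
myfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	/*
	 * Single-page requests go straight to the per-page helper.
	 * Larger requests are walked one page at a time by
	 * pvn_getpages(), which invokes myfs_getapage() for each page
	 * and may now be called with pl == NULL.
	 */
	if (len <= PAGESIZE)
		return (myfs_getapage(vp, (u_offset_t)off, len, protp,
		    pl, plsz, seg, addr, rw, cr));

	return (pvn_getpages(myfs_getapage, vp, (u_offset_t)off, len,
	    protp, pl, plsz, seg, addr, rw, cr));
}
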
1161 1161 /*
1162 1162 * Initialize the page list array.
1163 1163 */
1164 1164 /*ARGSUSED*/
1165 1165 void
1166 1166 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1167 1167 u_offset_t off, size_t io_len, enum seg_rw rw)
1168 1168 {
1169 1169 ssize_t sz;
1170 1170 page_t *ppcur, **ppp;
1171 1171
1172 1172 /*
1173 1173 * Set up to load plsz worth
1174 1174 * starting at the needed page.
1175 1175 */
1176 1176 while (pp != NULL && pp->p_offset != off) {
1177 1177 /*
1178 1178 * Remove page from the i/o list,
1179 1179 * release the i/o and the page lock.
1180 1180 */
1181 1181 ppcur = pp;
1182 1182 page_sub(&pp, ppcur);
1183 1183 page_io_unlock(ppcur);
1184 1184 (void) page_release(ppcur, 1);
1185 1185 }
1186 1186
1187 1187 if (pp == NULL) {
1188 1188 pl[0] = NULL;
1189 1189 return;
1190 1190 }
1191 1191
1192 1192 sz = plsz;
1193 1193
1194 1194 /*
1195 1195 * Initialize the page list array.
1196 1196 */
1197 1197 ppp = pl;
1198 1198 do {
1199 1199 ppcur = pp;
1200 1200 *ppp++ = ppcur;
1201 1201 page_sub(&pp, ppcur);
1202 1202 page_io_unlock(ppcur);
1203 1203 if (rw != S_CREATE)
1204 1204 page_downgrade(ppcur);
1205 1205 sz -= PAGESIZE;
1206 1206 } while (sz > 0 && pp != NULL);
1207 1207 *ppp = NULL; /* terminate list */
1208 1208
1209 1209 /*
1210 1210 * Now free the remaining pages that weren't
1211 1211 * loaded in the page list.
1212 1212 */
1213 1213 while (pp != NULL) {
1214 1214 ppcur = pp;
1215 1215 page_sub(&pp, ppcur);
1216 1216 page_io_unlock(ppcur);
1217 1217 (void) page_release(ppcur, 1);
1218 1218 }
1219 1219 }
73 lines elided