fs/xfs/linux-2.6/xfs_aops.c

   1 /*
   2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include "xfs.h"
  19 #include "xfs_bit.h"
  20 #include "xfs_log.h"
  21 #include "xfs_inum.h"
  22 #include "xfs_sb.h"
  23 #include "xfs_ag.h"
  24 #include "xfs_dir2.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_dmapi.h"
  27 #include "xfs_mount.h"
  28 #include "xfs_bmap_btree.h"
  29 #include "xfs_alloc_btree.h"
  30 #include "xfs_ialloc_btree.h"
  31 #include "xfs_dir2_sf.h"
  32 #include "xfs_attr_sf.h"
  33 #include "xfs_dinode.h"
  34 #include "xfs_inode.h"
  35 #include "xfs_alloc.h"
  36 #include "xfs_btree.h"
  37 #include "xfs_error.h"
  38 #include "xfs_rw.h"
  39 #include "xfs_iomap.h"
  40 #include "xfs_vnodeops.h"
  41 #include <linux/mpage.h>
  42 #include <linux/pagevec.h>
  43 #include <linux/writeback.h>
  44
  45
  46 /*
  47  * Prime number of hash buckets since address is used as the key.
  48  */
  49 #define NVSYNC          37
  50 #define to_ioend_wq(v)  (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
  51 static wait_queue_head_t xfs_ioend_wq[NVSYNC];
  52
  53 void __init
  54 xfs_ioend_init(void)
  55 {
  56         int i;
  57
  58         for (i = 0; i < NVSYNC; i++)
  59                 init_waitqueue_head(&xfs_ioend_wq[i]);
  60 }
  61
  62 void
  63 xfs_ioend_wait(
  64         xfs_inode_t     *ip)
  65 {
  66         wait_queue_head_t *wq = to_ioend_wq(ip);
  67
  68         wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
  69 }
  70
  71 STATIC void
  72 xfs_ioend_wake(
  73         xfs_inode_t     *ip)
  74 {
  75         if (atomic_dec_and_test(&ip->i_iocount))
  76                 wake_up(to_ioend_wq(ip));
  77 }
  78
  79 STATIC void
  80 xfs_count_page_state(
  81         struct page             *page,
  82         int                     *delalloc,
  83         int                     *unmapped,
  84         int                     *unwritten)
  85 {
  86         struct buffer_head      *bh, *head;
  87
  88         *delalloc = *unmapped = *unwritten = 0;
  89
  90         bh = head = page_buffers(page);
  91         do {
  92                 if (buffer_uptodate(bh) && !buffer_mapped(bh))
  93                         (*unmapped) = 1;
  94                 else if (buffer_unwritten(bh))
  95                         (*unwritten) = 1;
  96                 else if (buffer_delay(bh))
  97                         (*delalloc) = 1;
  98         } while ((bh = bh->b_this_page) != head);
  99 }
 100
 101 #if defined(XFS_RW_TRACE)
 102 void
 103 xfs_page_trace(
 104         int             tag,
 105         struct inode    *inode,
 106         struct page     *page,
 107         unsigned long   pgoff)
 108 {
 109         xfs_inode_t     *ip;
 110         loff_t          isize = i_size_read(inode);
 111         loff_t          offset = page_offset(page);
 112         int             delalloc = -1, unmapped = -1, unwritten = -1;
 113
 114         if (page_has_buffers(page))
 115                 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
 116
 117         ip = XFS_I(inode);
 118         if (!ip->i_rwtrace)
 119                 return;
 120
 121         ktrace_enter(ip->i_rwtrace,
 122                 (void *)((unsigned long)tag),
 123                 (void *)ip,
 124                 (void *)inode,
 125                 (void *)page,
 126                 (void *)pgoff,
 127                 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
 128                 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
 129                 (void *)((unsigned long)((isize >> 32) & 0xffffffff)),
 130                 (void *)((unsigned long)(isize & 0xffffffff)),
 131                 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
 132                 (void *)((unsigned long)(offset & 0xffffffff)),
 133                 (void *)((unsigned long)delalloc),
 134                 (void *)((unsigned long)unmapped),
 135                 (void *)((unsigned long)unwritten),
 136                 (void *)((unsigned long)current_pid()),
 137                 (void *)NULL);
 138 }
 139 #else
 140 #define xfs_page_trace(tag, inode, page, pgoff)
 141 #endif
 142
 143 STATIC struct block_device *
 144 xfs_find_bdev_for_inode(
 145         struct xfs_inode        *ip)
 146 {
 147         struct xfs_mount        *mp = ip->i_mount;
 148
 149         if (XFS_IS_REALTIME_INODE(ip))
 150                 return mp->m_rtdev_targp->bt_bdev;
 151         else
 152                 return mp->m_ddev_targp->bt_bdev;
 153 }
 154
 155 /*
 156  * We're now finished for good with this ioend structure.
 157  * Update the page state via the associated buffer_heads,
 158  * release holds on the inode and bio, and finally free
 159  * up memory.  Do not use the ioend after this.
 160  */
 161 STATIC void
 162 xfs_destroy_ioend(
 163         xfs_ioend_t             *ioend)
 164 {
 165         struct buffer_head      *bh, *next;
 166         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 167
 168         for (bh = ioend->io_buffer_head; bh; bh = next) {
 169                 next = bh->b_private;
 170                 bh->b_end_io(bh, !ioend->io_error);
 171         }
 172
 173         /*
 174          * Volume managers supporting multiple paths can send back ENODEV
 175          * when the final path disappears.  In this case continuing to fill
 176          * the page cache with dirty data which cannot be written out is
 177          * evil, so prevent that.
 178          */
 179         if (unlikely(ioend->io_error == -ENODEV)) {
 180                 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
 181                                       __FILE__, __LINE__);
 182         }
 183
 184         xfs_ioend_wake(ip);
 185         mempool_free(ioend, xfs_ioend_pool);
 186 }
 187
 188 /*
 189  * If the end of the current ioend is beyond the current EOF,
 190  * return the new EOF value, otherwise zero.
 191  */
 192 STATIC xfs_fsize_t
 193 xfs_ioend_new_eof(
 194         xfs_ioend_t             *ioend)
 195 {
 196         xfs_inode_t             *ip = XFS_I(ioend->io_inode);
 197         xfs_fsize_t             isize;
 198         xfs_fsize_t             bsize;
 199
 200         bsize = ioend->io_offset + ioend->io_size;
 201         isize = MAX(ip->i_size, ip->i_new_size);
 202         isize = MIN(isize, bsize);
 203         return isize > ip->i_d.di_size ? isize : 0;
 204 }
 205
 206 /*
 207  * Update on-disk file size now that data has been written to disk.  The
 208  * current in-memory file size is i_size.  If a write is beyond eof i_new_size
 209  * will be the intended file size until i_size is updated.  If this write does
 210  * not extend all the way to the valid file size then restrict this update to
 211  * the end of the write.
 212  *
 213  * This function does not block as blocking on the inode lock in IO completion
 214  * can lead to IO completion order dependency deadlocks.. If it can't get the
 215  * inode ilock it will return EAGAIN. Callers must handle this.
 216  */
 217 STATIC int
 218 xfs_setfilesize(
 219         xfs_ioend_t             *ioend)
 220 {
 221         xfs_inode_t             *ip = XFS_I(ioend->io_inode);
 222         xfs_fsize_t             isize;
 223
 224         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
 225         ASSERT(ioend->io_type != IOMAP_READ);
 226
 227         if (unlikely(ioend->io_error))
 228                 return 0;
 229
 230         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
 231                 return EAGAIN;
 232
 233         isize = xfs_ioend_new_eof(ioend);
 234         if (isize) {
 235                 ip->i_d.di_size = isize;
 236                 xfs_mark_inode_dirty_sync(ip);
 237         }
 238
 239         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 240         return 0;
 241 }
 242
 243 /*
 244  * Schedule IO completion handling on a xfsdatad if this was
 245  * the final hold on this ioend. If we are asked to wait,
 246  * flush the workqueue.
 247  */
 248 STATIC void
 249 xfs_finish_ioend(
 250         xfs_ioend_t     *ioend,
 251         int             wait)
 252 {
 253         if (atomic_dec_and_test(&ioend->io_remaining)) {
 254                 struct workqueue_struct *wq;
 255
 256                 wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
 257                         xfsconvertd_workqueue : xfsdatad_workqueue;
 258                 queue_work(wq, &ioend->io_work);
 259                 if (wait)
 260                         flush_workqueue(wq);
 261         }
 262 }
 263
 264 /*
 265  * Buffered IO write completion for delayed allocate extents.
 266  */
 267 STATIC void
 268 xfs_end_bio_delalloc(
 269         struct work_struct      *work)
 270 {
 271         xfs_ioend_t             *ioend =
 272                 container_of(work, xfs_ioend_t, io_work);
 273         int                     error;
 274
 275         /*
 276          * If we didn't complete processing of the ioend, requeue it to the
 277          * tail of the workqueue for another attempt later. Otherwise destroy
 278          * it.
 279          */
 280         error = xfs_setfilesize(ioend);
 281         if (error == EAGAIN) {
 282                 atomic_inc(&ioend->io_remaining);
 283                 xfs_finish_ioend(ioend, 0);
 284                 /* ensure we don't spin on blocked ioends */
 285                 delay(1);
 286         } else {
 287                 ASSERT(!error);
 288                 xfs_destroy_ioend(ioend);
 289         }
 290 }
 291
 292 /*
 293  * Buffered IO write completion for regular, written extents.
 294  */
 295 STATIC void
 296 xfs_end_bio_written(
 297         struct work_struct      *work)
 298 {
 299         xfs_ioend_t             *ioend =
 300                 container_of(work, xfs_ioend_t, io_work);
 301         int                     error;
 302
 303         /*
 304          * If we didn't complete processing of the ioend, requeue it to the
 305          * tail of the workqueue for another attempt later. Otherwise destroy
 306          * it.
 307          */
 308         error = xfs_setfilesize(ioend);
 309         if (error == EAGAIN) {
 310                 atomic_inc(&ioend->io_remaining);
 311                 xfs_finish_ioend(ioend, 0);
 312                 /* ensure we don't spin on blocked ioends */
 313                 delay(1);
 314         } else {
 315                 ASSERT(!error);
 316                 xfs_destroy_ioend(ioend);
 317         }
 318 }
 319
 320 /*
 321  * IO write completion for unwritten extents.
 322  *
 323  * Issue transactions to convert a buffer range from unwritten
 324  * to written extents.
 325  */
 326 STATIC void
 327 xfs_end_bio_unwritten(
 328         struct work_struct      *work)
 329 {
 330         xfs_ioend_t             *ioend =
 331                 container_of(work, xfs_ioend_t, io_work);
 332         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 333         xfs_off_t               offset = ioend->io_offset;
 334         size_t                  size = ioend->io_size;
 335
 336         if (likely(!ioend->io_error)) {
 337                 int     error;
 338                 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 339                         error = xfs_iomap_write_unwritten(ip, offset, size);
 340                         if (error)
 341                                 ioend->io_error = error;
 342                 }
 343                 /*
 344                  * If we didn't complete processing of the ioend, requeue it to the
 345                  * tail of the workqueue for another attempt later. Otherwise destroy
 346                  * it.
 347                  */
 348                 error = xfs_setfilesize(ioend);
 349                 if (error == EAGAIN) {
 350                         atomic_inc(&ioend->io_remaining);
 351                         xfs_finish_ioend(ioend, 0);
 352                         /* ensure we don't spin on blocked ioends */
 353                         delay(1);
 354                         return;
 355                 }
 356         }
 357         xfs_destroy_ioend(ioend);
 358 }
 359
 360 /*
 361  * IO read completion for regular, written extents.
 362  */
 363 STATIC void
 364 xfs_end_bio_read(
 365         struct work_struct      *work)
 366 {
 367         xfs_ioend_t             *ioend =
 368                 container_of(work, xfs_ioend_t, io_work);
 369
 370         xfs_destroy_ioend(ioend);
 371 }
 372
 373 /*
 374  * Allocate and initialise an IO completion structure.
 375  * We need to track unwritten extent write completion here initially.
 376  * We'll need to extend this for updating the ondisk inode size later
 377  * (vs. incore size).
 378  */
 379 STATIC xfs_ioend_t *
 380 xfs_alloc_ioend(
 381         struct inode            *inode,
 382         unsigned int            type)
 383 {
 384         xfs_ioend_t             *ioend;
 385
 386         ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
 387
 388         /*
 389          * Set the count to 1 initially, which will prevent an I/O
 390          * completion callback from happening before we have started
 391          * all the I/O from calling the completion routine too early.
 392          */
 393         atomic_set(&ioend->io_remaining, 1);
 394         ioend->io_error = 0;
 395         ioend->io_list = NULL;
 396         ioend->io_type = type;
 397         ioend->io_inode = inode;
 398         ioend->io_buffer_head = NULL;
 399         ioend->io_buffer_tail = NULL;
 400         atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
 401         ioend->io_offset = 0;
 402         ioend->io_size = 0;
 403
 404         if (type == IOMAP_UNWRITTEN)
 405                 INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten);
 406         else if (type == IOMAP_DELAY)
 407                 INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc);
 408         else if (type == IOMAP_READ)
 409                 INIT_WORK(&ioend->io_work, xfs_end_bio_read);
 410         else
 411                 INIT_WORK(&ioend->io_work, xfs_end_bio_written);
 412
 413         return ioend;
 414 }
 415
 416 STATIC int
 417 xfs_map_blocks(
 418         struct inode            *inode,
 419         loff_t                  offset,
 420         ssize_t                 count,
 421         xfs_iomap_t             *mapp,
 422         int                     flags)
 423 {
 424         int                     nmaps = 1;
 425
 426         return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps);
 427 }
 428
 429 STATIC_INLINE int
 430 xfs_iomap_valid(
 431         xfs_iomap_t             *iomapp,
 432         loff_t                  offset)
 433 {
 434         return offset >= iomapp->iomap_offset &&
 435                 offset < iomapp->iomap_offset + iomapp->iomap_bsize;
 436 }
 437
 438 /*
 439  * BIO completion handler for buffered IO.
 440  */
 441 STATIC void
 442 xfs_end_bio(
 443         struct bio              *bio,
 444         int                     error)
 445 {
 446         xfs_ioend_t             *ioend = bio->bi_private;
 447
 448         ASSERT(atomic_read(&bio->bi_cnt) >= 1);
 449         ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
 450
 451         /* Toss bio and pass work off to an xfsdatad thread */
 452         bio->bi_private = NULL;
 453         bio->bi_end_io = NULL;
 454         bio_put(bio);
 455
 456         xfs_finish_ioend(ioend, 0);
 457 }
 458
 459 STATIC void
 460 xfs_submit_ioend_bio(
 461         xfs_ioend_t     *ioend,
 462         struct bio      *bio)
 463 {
 464         atomic_inc(&ioend->io_remaining);
 465         bio->bi_private = ioend;
 466         bio->bi_end_io = xfs_end_bio;
 467
 468         /*
 469          * If the I/O is beyond EOF we mark the inode dirty immediately
 470          * but don't update the inode size until I/O completion.
 471          */
 472         if (xfs_ioend_new_eof(ioend))
 473                 xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode));
 474
 475         submit_bio(WRITE, bio);
 476         ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
 477         bio_put(bio);
 478 }
 479
 480 STATIC struct bio *
 481 xfs_alloc_ioend_bio(
 482         struct buffer_head      *bh)
 483 {
 484         struct bio              *bio;
 485         int                     nvecs = bio_get_nr_vecs(bh->b_bdev);
 486
 487         do {
 488                 bio = bio_alloc(GFP_NOIO, nvecs);
 489                 nvecs >>= 1;
 490         } while (!bio);
 491
 492         ASSERT(bio->bi_private == NULL);
 493         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 494         bio->bi_bdev = bh->b_bdev;
 495         bio_get(bio);
 496         return bio;
 497 }
 498
 499 STATIC void
 500 xfs_start_buffer_writeback(
 501         struct buffer_head      *bh)
 502 {
 503         ASSERT(buffer_mapped(bh));
 504         ASSERT(buffer_locked(bh));
 505         ASSERT(!buffer_delay(bh));
 506         ASSERT(!buffer_unwritten(bh));
 507
 508         mark_buffer_async_write(bh);
 509         set_buffer_uptodate(bh);
 510         clear_buffer_dirty(bh);
 511 }
 512
 513 STATIC void
 514 xfs_start_page_writeback(
 515         struct page             *page,
 516         int                     clear_dirty,
 517         int                     buffers)
 518 {
 519         ASSERT(PageLocked(page));
 520         ASSERT(!PageWriteback(page));
 521         if (clear_dirty)
 522                 clear_page_dirty_for_io(page);
 523         set_page_writeback(page);
 524         unlock_page(page);
 525         /* If no buffers on the page are to be written, finish it here */
 526         if (!buffers)
 527                 end_page_writeback(page);
 528 }
 529
 530 static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 531 {
 532         return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
 533 }
 534
 535 /*
 536  * Submit all of the bios for all of the ioends we have saved up, covering the
 537  * initial writepage page and also any probed pages.
 538  *
 539  * Because we may have multiple ioends spanning a page, we need to start
 540  * writeback on all the buffers before we submit them for I/O. If we mark the
 541  * buffers as we got, then we can end up with a page that only has buffers
 542  * marked async write and I/O complete on can occur before we mark the other
 543  * buffers async write.
 544  *
 545  * The end result of this is that we trip a bug in end_page_writeback() because
 546  * we call it twice for the one page as the code in end_buffer_async_write()
 547  * assumes that all buffers on the page are started at the same time.
 548  *
 549  * The fix is two passes across the ioend list - one to start writeback on the
 550  * buffer_heads, and then submit them for I/O on the second pass.
 551  */
 552 STATIC void
 553 xfs_submit_ioend(
 554         xfs_ioend_t             *ioend)
 555 {
 556         xfs_ioend_t             *head = ioend;
 557         xfs_ioend_t             *next;
 558         struct buffer_head      *bh;
 559         struct bio              *bio;
 560         sector_t                lastblock = 0;
 561
 562         /* Pass 1 - start writeback */
 563         do {
 564                 next = ioend->io_list;
 565                 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
 566                         xfs_start_buffer_writeback(bh);
 567                 }
 568         } while ((ioend = next) != NULL);
 569
 570         /* Pass 2 - submit I/O */
 571         ioend = head;
 572         do {
 573                 next = ioend->io_list;
 574                 bio = NULL;
 575
 576                 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
 577
 578                         if (!bio) {
 579  retry:
 580                                 bio = xfs_alloc_ioend_bio(bh);
 581                         } else if (bh->b_blocknr != lastblock + 1) {
 582                                 xfs_submit_ioend_bio(ioend, bio);
 583                                 goto retry;
 584                         }
 585
 586                         if (bio_add_buffer(bio, bh) != bh->b_size) {
 587                                 xfs_submit_ioend_bio(ioend, bio);
 588                                 goto retry;
 589                         }
 590
 591                         lastblock = bh->b_blocknr;
 592                 }
 593                 if (bio)
 594                         xfs_submit_ioend_bio(ioend, bio);
 595                 xfs_finish_ioend(ioend, 0);
 596         } while ((ioend = next) != NULL);
 597 }
 598
 599 /*
 600  * Cancel submission of all buffer_heads so far in this endio.
 601  * Toss the endio too.  Only ever called for the initial page
 602  * in a writepage request, so only ever one page.
 603  */
 604 STATIC void
 605 xfs_cancel_ioend(
 606         xfs_ioend_t             *ioend)
 607 {
 608         xfs_ioend_t             *next;
 609         struct buffer_head      *bh, *next_bh;
 610
 611         do {
 612                 next = ioend->io_list;
 613                 bh = ioend->io_buffer_head;
 614                 do {
 615                         next_bh = bh->b_private;
 616                         clear_buffer_async_write(bh);
 617                         unlock_buffer(bh);
 618                 } while ((bh = next_bh) != NULL);
 619
 620                 xfs_ioend_wake(XFS_I(ioend->io_inode));
 621                 mempool_free(ioend, xfs_ioend_pool);
 622         } while ((ioend = next) != NULL);
 623 }
 624
 625 /*
 626  * Test to see if we've been building up a completion structure for
 627  * earlier buffers -- if so, we try to append to this ioend if we
 628  * can, otherwise we finish off any current ioend and start another.
 629  * Return true if we've finished the given ioend.
 630  */
 631 STATIC void
 632 xfs_add_to_ioend(
 633         struct inode            *inode,
 634         struct buffer_head      *bh,
 635         xfs_off_t               offset,
 636         unsigned int            type,
 637         xfs_ioend_t             **result,
 638         int                     need_ioend)
 639 {
 640         xfs_ioend_t             *ioend = *result;
 641
 642         if (!ioend || need_ioend || type != ioend->io_type) {
 643                 xfs_ioend_t     *previous = *result;
 644
 645                 ioend = xfs_alloc_ioend(inode, type);
 646                 ioend->io_offset = offset;
 647                 ioend->io_buffer_head = bh;
 648                 ioend->io_buffer_tail = bh;
 649                 if (previous)
 650                         previous->io_list = ioend;
 651                 *result = ioend;
 652         } else {
 653                 ioend->io_buffer_tail->b_private = bh;
 654                 ioend->io_buffer_tail = bh;
 655         }
 656
 657         bh->b_private = NULL;
 658         ioend->io_size += bh->b_size;
 659 }
 660
 661 STATIC void
 662 xfs_map_buffer(
 663         struct buffer_head      *bh,
 664         xfs_iomap_t             *mp,
 665         xfs_off_t               offset,
 666         uint                    block_bits)
 667 {
 668         sector_t                bn;
 669
 670         ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL);
 671
 672         bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) +
 673               ((offset - mp->iomap_offset) >> block_bits);
 674
 675         ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME));
 676
 677         bh->b_blocknr = bn;
 678         set_buffer_mapped(bh);
 679 }
 680
 681 STATIC void
 682 xfs_map_at_offset(
 683         struct buffer_head      *bh,
 684         loff_t                  offset,
 685         int                     block_bits,
 686         xfs_iomap_t             *iomapp)
 687 {
 688         ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
 689         ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
 690
 691         lock_buffer(bh);
 692         xfs_map_buffer(bh, iomapp, offset, block_bits);
 693         bh->b_bdev = iomapp->iomap_target->bt_bdev;
 694         set_buffer_mapped(bh);
 695         clear_buffer_delay(bh);
 696         clear_buffer_unwritten(bh);
 697 }
 698
 699 /*
 700  * Look for a page at index that is suitable for clustering.
 701  */
 702 STATIC unsigned int
 703 xfs_probe_page(
 704         struct page             *page,
 705         unsigned int            pg_offset,
 706         int                     mapped)
 707 {
 708         int                     ret = 0;
 709
 710         if (PageWriteback(page))
 711                 return 0;
 712
 713         if (page->mapping && PageDirty(page)) {
 714                 if (page_has_buffers(page)) {
 715                         struct buffer_head      *bh, *head;
 716
 717                         bh = head = page_buffers(page);
 718                         do {
 719                                 if (!buffer_uptodate(bh))
 720                                         break;
 721                                 if (mapped != buffer_mapped(bh))
 722                                         break;
 723                                 ret += bh->b_size;
 724                                 if (ret >= pg_offset)
 725                                         break;
 726                         } while ((bh = bh->b_this_page) != head);
 727                 } else
 728                         ret = mapped ? 0 : PAGE_CACHE_SIZE;
 729         }
 730
 731         return ret;
 732 }
 733
 734 STATIC size_t
 735 xfs_probe_cluster(
 736         struct inode            *inode,
 737         struct page             *startpage,
 738         struct buffer_head      *bh,
 739         struct buffer_head      *head,
 740         int                     mapped)
 741 {
 742         struct pagevec          pvec;
 743         pgoff_t                 tindex, tlast, tloff;
 744         size_t                  total = 0;
 745         int                     done = 0, i;
 746
 747         /* First sum forwards in this page */
 748         do {
 749                 if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh)))
 750                         return total;
 751                 total += bh->b_size;
 752         } while ((bh = bh->b_this_page) != head);
 753
 754         /* if we reached the end of the page, sum forwards in following pages */
 755         tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
 756         tindex = startpage->index + 1;
 757
 758         /* Prune this back to avoid pathological behavior */
 759         tloff = min(tlast, startpage->index + 64);
 760
 761         pagevec_init(&pvec, 0);
 762         while (!done && tindex <= tloff) {
 763                 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
 764
 765                 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
 766                         break;
 767
 768                 for (i = 0; i < pagevec_count(&pvec); i++) {
 769                         struct page *page = pvec.pages[i];
 770                         size_t pg_offset, pg_len = 0;
 771
 772                         if (tindex == tlast) {
 773                                 pg_offset =
 774                                     i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
 775                                 if (!pg_offset) {
 776                                         done = 1;
 777                                         break;
 778                                 }
 779                         } else
 780                                 pg_offset = PAGE_CACHE_SIZE;
 781
 782                         if (page->index == tindex && trylock_page(page)) {
 783                                 pg_len = xfs_probe_page(page, pg_offset, mapped);
 784                                 unlock_page(page);
 785                         }
 786
 787                         if (!pg_len) {
 788                                 done = 1;
 789                                 break;
 790                         }
 791
 792                         total += pg_len;
 793                         tindex++;
 794                 }
 795
 796                 pagevec_release(&pvec);
 797                 cond_resched();
 798         }
 799
 800         return total;
 801 }
 802
 803 /*
 804  * Test if a given page is suitable for writing as part of an unwritten
 805  * or delayed allocate extent.
 806  */
 807 STATIC int
 808 xfs_is_delayed_page(
 809         struct page             *page,
 810         unsigned int            type)
 811 {
 812         if (PageWriteback(page))
 813                 return 0;
 814
 815         if (page->mapping && page_has_buffers(page)) {
 816                 struct buffer_head      *bh, *head;
 817                 int                     acceptable = 0;
 818
 819                 bh = head = page_buffers(page);
 820                 do {
 821                         if (buffer_unwritten(bh))
 822                                 acceptable = (type == IOMAP_UNWRITTEN);
 823                         else if (buffer_delay(bh))
 824                                 acceptable = (type == IOMAP_DELAY);
 825                         else if (buffer_dirty(bh) && buffer_mapped(bh))
 826                                 acceptable = (type == IOMAP_NEW);
 827                         else
 828                                 break;
 829                 } while ((bh = bh->b_this_page) != head);
 830
 831                 if (acceptable)
 832                         return 1;
 833         }
 834
 835         return 0;
 836 }
 837
 838 /*
 839  * Allocate & map buffers for page given the extent map. Write it out.
 840  * Except for the original page of a writepage, this is called on
 841  * delalloc/unwritten pages only; for the original page it is possible
 842  * that the page has no mapping at all.
      *
      * The page is expected to sit at page cache index tindex in the inode's
      * mapping.  It is try-locked here; if it cannot be locked, is under
      * writeback, belongs to another mapping, or xfs_is_delayed_page() rejects
      * it for the current ioend type, nothing is converted.  Note that *ioendp
      * is dereferenced unconditionally, so it must point at a valid ioend.
      *
      * For each buffer before EOF that is delalloc or unwritten and covered by
      * mp, the buffer is mapped with xfs_map_at_offset() and either added to
      * the ioend chain (startio set) or just marked dirty.  Buffers that are
      * neither delalloc nor unwritten are only added when they are mapped and
      * both all_bh and startio are set.
      *
      * Returns non-zero when the caller should stop walking further pages
      * (page unusable, a buffer could not be handled, the device is congested
      * or wbc->nr_to_write is used up), zero to continue with the next page.
 843  */
 844 STATIC int
 845 xfs_convert_page(
 846         struct inode            *inode,
 847         struct page             *page,
 848         loff_t                  tindex,
 849         xfs_iomap_t             *mp,
 850         xfs_ioend_t             **ioendp,
 851         struct writeback_control *wbc,
 852         int                     startio,
 853         int                     all_bh)
 854 {
 855         struct buffer_head      *bh, *head;
 856         xfs_off_t               end_offset;
 857         unsigned long           p_offset;
 858         unsigned int            type;
 859         int                     bbits = inode->i_blkbits;
 860         int                     len, page_dirty;
 861         int                     count = 0, done = 0, uptodate = 1;
 862         xfs_off_t               offset = page_offset(page);
 863
             /*
              * Sanity checks.  The first two fail before the page lock is
              * held (goto fail); the rest must drop it (goto fail_unlock_page).
              */
 864         if (page->index != tindex)
 865                 goto fail;
 866         if (!trylock_page(page))
 867                 goto fail;
 868         if (PageWriteback(page))
 869                 goto fail_unlock_page;
 870         if (page->mapping != inode->i_mapping)
 871                 goto fail_unlock_page;
 872         if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
 873                 goto fail_unlock_page;
 874
 875         /*
 876          * page_dirty is initially a count of buffers on the page before
 877          * EOF and is decremented as we move each into a cleanable state.
 878          *
 879          * Derivation:
 880          *
 881          * End offset is the highest offset that this page should represent.
 882          * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
 883          * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
 884          * hence give us the correct page_dirty count. On any other page,
 885          * it will be zero and in that case we need page_dirty to be the
 886          * count of buffers on the page.
 887          */
 888         end_offset = min_t(unsigned long long,
 889                         (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
 890                         i_size_read(inode));
 891
 892         len = 1 << inode->i_blkbits;
 893         p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
 894                                         PAGE_CACHE_SIZE);
 895         p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
 896         page_dirty = p_offset / len;
 897
             /*
              * Walk the page's circular buffer list.  A "continue" below jumps
              * to the loop condition, which still advances offset and bh.
              */
 898         bh = head = page_buffers(page);
 899         do {
 900                 if (offset >= end_offset)
 901                         break;
 902                 if (!buffer_uptodate(bh))
 903                         uptodate = 0;
 904                 if (!(PageUptodate(page) || buffer_uptodate(bh))) {
 905                         done = 1;
 906                         continue;
 907                 }
 908
 909                 if (buffer_unwritten(bh) || buffer_delay(bh)) {
 910                         if (buffer_unwritten(bh))
 911                                 type = IOMAP_UNWRITTEN;
 912                         else
 913                                 type = IOMAP_DELAY;
 914
                             /* the supplied mapping does not cover this block */
 915                         if (!xfs_iomap_valid(mp, offset)) {
 916                                 done = 1;
 917                                 continue;
 918                         }
 919
 920                         ASSERT(!(mp->iomap_flags & IOMAP_HOLE));
 921                         ASSERT(!(mp->iomap_flags & IOMAP_DELAY));
 922
 923                         xfs_map_at_offset(bh, offset, bbits, mp);
 924                         if (startio) {
 925                                 xfs_add_to_ioend(inode, bh, offset,
 926                                                 type, ioendp, done);
 927                         } else {
                                     /*
                                      * NOTE(review): the unlock here presumably
                                      * pairs with a lock taken inside
                                      * xfs_map_at_offset() - confirm.
                                      */
 928                                 set_buffer_dirty(bh);
 929                                 unlock_buffer(bh);
 930                                 mark_buffer_dirty(bh);
 931                         }
 932                         page_dirty--;
 933                         count++;
 934                 } else {
 935                         type = IOMAP_NEW;
 936                         if (buffer_mapped(bh) && all_bh && startio) {
 937                                 lock_buffer(bh);
 938                                 xfs_add_to_ioend(inode, bh, offset,
 939                                                 type, ioendp, done);
 940                                 count++;
 941                                 page_dirty--;
 942                         } else {
 943                                 done = 1;
 944                         }
 945                 }
 946         } while (offset += len, (bh = bh->b_this_page) != head);
 947
             /* only if the walk covered every buffer and all were uptodate */
 948         if (uptodate && bh == head)
 949                 SetPageUptodate(page);
 950
 951         if (startio) {
 952                 if (count) {
 953                         struct backing_dev_info *bdi;
 954
 955                         bdi = inode->i_mapping->backing_dev_info;
 956                         wbc->nr_to_write--;
 957                         if (bdi_write_congested(bdi)) {
 958                                 wbc->encountered_congestion = 1;
 959                                 done = 1;
 960                         } else if (wbc->nr_to_write <= 0) {
 961                                 done = 1;
 962                         }
 963                 }
                     /* second argument: true when no buffer before EOF is left over */
 964                 xfs_start_page_writeback(page, !page_dirty, count);
 965         }
 966
 967         return done;
 968  fail_unlock_page:
 969         unlock_page(page);
 970  fail:
 971         return 1;
 972 }
 973
 974 /*
 975  * Convert & write out a cluster of pages in the same extent as defined
 976  * by mp and following the start page.
      *
      * Pages are looked up in batches of at most PAGEVEC_SIZE, starting at
      * index tindex and never going past tlast.  Each page found is handed to
      * xfs_convert_page() together with the index it is expected to have;
      * that index is advanced once per page handed over, and since
      * xfs_convert_page() rejects a page whose index differs, a gap in the
      * page cache ends the cluster.  The walk also ends when the lookup
      * returns no pages or xfs_convert_page() returns non-zero.
 977  */
 978 STATIC void
 979 xfs_cluster_write(
 980         struct inode            *inode,
 981         pgoff_t                 tindex,
 982         xfs_iomap_t             *iomapp,
 983         xfs_ioend_t             **ioendp,
 984         struct writeback_control *wbc,
 985         int                     startio,
 986         int                     all_bh,
 987         pgoff_t                 tlast)
 988 {
 989         struct pagevec          pvec;
 990         int                     done = 0, i;
 991
 992         pagevec_init(&pvec, 0);
 993         while (!done && tindex <= tlast) {
                     /* never ask for more pages than remain up to tlast */
 994                 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
 995
 996                 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
 997                         break;
 998
 999                 for (i = 0; i < pagevec_count(&pvec); i++) {
1000                         done = xfs_convert_page(inode, pvec.pages[i], tindex++,
1001                                         iomapp, ioendp, wbc, startio, all_bh);
1002                         if (done)
1003                                 break;
1004                 }
1005
                     /* drop the batch before possibly rescheduling */
1006                 pagevec_release(&pvec);
1007                 cond_resched();
1008         }
1009 }
1010
1011 /*
1012  * Calling this without startio set means we are being asked to make a dirty
1013  * page ready for freeing its buffers.  When called with startio set then
1014  * we are coming from writepage.
1015  *
1016  * When called with startio set it is important that we write the WHOLE
1017  * page if possible.
1018  * The bh->b_state's cannot know if any of the blocks or which block for
1019  * that matter are dirty due to mmap writes, and therefore bh uptodate is
1020  * only valid if the page itself isn't completely uptodate.  Some layers
1021  * may clear the page dirty flag prior to calling write page, under the
1022  * assumption the entire page will be written out; by not writing out the
1023  * whole page the page can be reused before all valid dirty data is
1024  * written out.  Note: in the case of a page that has been dirtied by
1025  * mapwrite but only partially set up by block_prepare_write the
1026  * bh->b_state's will not agree and only ones set up by BPW/BCW will have
1027  * valid state, thus the whole page must be written out.
      *
      * Return value:
      *  - 0 if the page lies entirely beyond EOF (the page is unlocked first
      *    when startio is set);
      *  - otherwise page_dirty, the count of buffers before EOF that were
      *    not moved into a cleanable state;
      *  - or the error returned by xfs_map_blocks().  In that case the ioends
      *    built so far are cancelled and, unless the error is -EAGAIN, the
      *    page is marked not uptodate (and invalidated when unmapped is zero).
      *    The page is not unlocked on the error path; the visible caller
      *    xfs_vm_writepage() does that itself.
1028  */
1029
1030 STATIC int
1031 xfs_page_state_convert(
1032         struct inode    *inode,
1033         struct page     *page,
1034         struct writeback_control *wbc,
1035         int             startio,
1036         int             unmapped) /* also implies page uptodate */
1037 {
1038         struct buffer_head      *bh, *head;
1039         xfs_iomap_t             iomap;
1040         xfs_ioend_t             *ioend = NULL, *iohead = NULL;
1041         loff_t                  offset;
1042         unsigned long           p_offset = 0;
1043         unsigned int            type;
1044         __uint64_t              end_offset;
1045         pgoff_t                 end_index, last_index, tlast;
1046         ssize_t                 size, len;
1047         int                     flags, err, iomap_valid = 0, uptodate = 1;
1048         int                     page_dirty, count = 0;
1049         int                     trylock = 0;
1050         int                     all_bh = unmapped;
1051
             /* only non-blocking WB_SYNC_NONE writeback adds BMAPI_TRYLOCK */
1052         if (startio) {
1053                 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
1054                         trylock |= BMAPI_TRYLOCK;
1055         }
1056
1057         /* Is this page beyond the end of the file? */
1058         offset = i_size_read(inode);
1059         end_index = offset >> PAGE_CACHE_SHIFT;
1060         last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
1061         if (page->index >= end_index) {
                     /*
                      * Either the page starts past the page containing EOF, or
                      * EOF is page aligned and this page starts exactly at it.
                      */
1062                 if ((page->index >= end_index + 1) ||
1063                     !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
1064                         if (startio)
1065                                 unlock_page(page);
1066                         return 0;
1067                 }
1068         }
1069
1070         /*
1071          * page_dirty is initially a count of buffers on the page before
1072          * EOF and is decremented as we move each into a cleanable state.
1073          *
1074          * Derivation:
1075          *
1076          * End offset is the highest offset that this page should represent.
1077          * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
1078          * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
1079          * hence give us the correct page_dirty count. On any other page,
1080          * it will be zero and in that case we need page_dirty to be the
1081          * count of buffers on the page.
1082          */
1083         end_offset = min_t(unsigned long long,
1084                         (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
1085         len = 1 << inode->i_blkbits;
1086         p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
1087                                         PAGE_CACHE_SIZE);
1088         p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
1089         page_dirty = p_offset / len;
1090
             /* offset is reused: from here on it is the file offset of bh */
1091         bh = head = page_buffers(page);
1092         offset = page_offset(page);
1093         flags = BMAPI_READ;
1094         type = IOMAP_NEW;
1095
1096         /* TODO: cleanup count and page_dirty */
1097
1098         do {
1099                 if (offset >= end_offset)
1100                         break;
1101                 if (!buffer_uptodate(bh))
1102                         uptodate = 0;
1103                 if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) {
1104                         /*
1105                          * the iomap is actually still valid, but the ioend
1106                          * isn't.  shouldn't happen too often.
1107                          */
1108                         iomap_valid = 0;
1109                         continue;
1110                 }
1111
                     /* re-check that the cached mapping still covers this block */
1112                 if (iomap_valid)
1113                         iomap_valid = xfs_iomap_valid(&iomap, offset);
1114
1115                 /*
1116                  * First case, map an unwritten extent and prepare for
1117                  * extent state conversion transaction on completion.
1118                  *
1119                  * Second case, allocate space for a delalloc buffer.
1120                  * We can return EAGAIN here in the release page case.
1121                  *
1122                  * Third case, an unmapped buffer was found, and we are
1123                  * in a path where we need to write the whole page out.
1124                  */
1125                 if (buffer_unwritten(bh) || buffer_delay(bh) ||
1126                     ((buffer_uptodate(bh) || PageUptodate(page)) &&
1127                      !buffer_mapped(bh) && (unmapped || startio))) {
1128                         int new_ioend = 0;
1129
1130                         /*
1131                          * Make sure we don't use a read-only iomap
1132                          */
1133                         if (flags == BMAPI_READ)
1134                                 iomap_valid = 0;
1135
1136                         if (buffer_unwritten(bh)) {
1137                                 type = IOMAP_UNWRITTEN;
1138                                 flags = BMAPI_WRITE | BMAPI_IGNSTATE;
1139                         } else if (buffer_delay(bh)) {
1140                                 type = IOMAP_DELAY;
1141                                 flags = BMAPI_ALLOCATE | trylock;
1142                         } else {
1143                                 type = IOMAP_NEW;
1144                                 flags = BMAPI_WRITE | BMAPI_MMAP;
1145                         }
1146
1147                         if (!iomap_valid) {
1148                                 /*
1149                                  * if we didn't have a valid mapping then we
1150                                  * need to ensure that we put the new mapping
1151                                  * in a new ioend structure. This needs to be
1152                                  * done to ensure that the ioends correctly
1153                                  * reflect the block mappings at io completion
1154                                  * for unwritten extent conversion.
1155                                  */
1156                                 new_ioend = 1;
1157                                 if (type == IOMAP_NEW) {
1158                                         size = xfs_probe_cluster(inode,
1159                                                         page, bh, head, 0);
1160                                 } else {
1161                                         size = len;
1162                                 }
1163
1164                                 err = xfs_map_blocks(inode, offset, size,
1165                                                 &iomap, flags);
1166                                 if (err)
1167                                         goto error;
1168                                 iomap_valid = xfs_iomap_valid(&iomap, offset);
1169                         }
1170                         if (iomap_valid) {
1171                                 xfs_map_at_offset(bh, offset,
1172                                                 inode->i_blkbits, &iomap);
1173                                 if (startio) {
1174                                         xfs_add_to_ioend(inode, bh, offset,
1175                                                         type, &ioend,
1176                                                         new_ioend);
1177                                 } else {
1178                                         set_buffer_dirty(bh);
1179                                         unlock_buffer(bh);
1180                                         mark_buffer_dirty(bh);
1181                                 }
1182                                 page_dirty--;
1183                                 count++;
1184                         }
1185                 } else if (buffer_uptodate(bh) && startio) {
1186                         /*
1187                          * we got here because the buffer is already mapped.
1188                          * That means it must already have extents allocated
1189                          * underneath it. Map the extent by reading it.
1190                          */
1191                         if (!iomap_valid || flags != BMAPI_READ) {
1192                                 flags = BMAPI_READ;
1193                                 size = xfs_probe_cluster(inode, page, bh,
1194                                                                 head, 1);
1195                                 err = xfs_map_blocks(inode, offset, size,
1196                                                 &iomap, flags);
1197                                 if (err)
1198                                         goto error;
1199                                 iomap_valid = xfs_iomap_valid(&iomap, offset);
1200                         }
1201
1202                         /*
1203                          * We set the type to IOMAP_NEW in case we are doing a
1204                          * small write at EOF that is extending the file but
1205                          * without needing an allocation. We need to update the
1206                          * file size on I/O completion in this case so it is
1207                          * the same case as having just allocated a new extent
1208                          * that we are writing into for the first time.
1209                          */
1210                         type = IOMAP_NEW;
1211                         if (trylock_buffer(bh)) {
1212                                 ASSERT(buffer_mapped(bh));
1213                                 if (iomap_valid)
1214                                         all_bh = 1;
1215                                 xfs_add_to_ioend(inode, bh, offset, type,
1216                                                 &ioend, !iomap_valid);
1217                                 page_dirty--;
1218                                 count++;
1219                         } else {
                                     /* buffer is locked by someone else: skip it */
1220                                 iomap_valid = 0;
1221                         }
1222                 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
1223                            (unmapped || startio)) {
1224                         iomap_valid = 0;
1225                 }
1226
                     /* remember the first ioend so the whole chain can be found */
1227                 if (!iohead)
1228                         iohead = ioend;
1229
1230         } while (offset += len, ((bh = bh->b_this_page) != head));
1231
             /* only if the walk covered every buffer and all were uptodate */
1232         if (uptodate && bh == head)
1233                 SetPageUptodate(page);
1234
1235         if (startio)
1236                 xfs_start_page_writeback(page, 1, count);
1237
             /*
              * The last mapping is still valid, so try to push the following
              * pages it covers as well.  tlast is the last page index covered
              * by both the mapping and the file.
              */
1238         if (ioend && iomap_valid) {
1239                 offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
1240                                         PAGE_CACHE_SHIFT;
1241                 tlast = min_t(pgoff_t, offset, last_index);
1242                 xfs_cluster_write(inode, page->index + 1, &iomap, &ioend,
1243                                         wbc, startio, all_bh, tlast);
1244         }
1245
1246         if (iohead)
1247                 xfs_submit_ioend(iohead);
1248
1249         return page_dirty;
1250
1251 error:
1252         if (iohead)
1253                 xfs_cancel_ioend(iohead);
1254
1255         /*
1256          * If it's delalloc and we have nowhere to put it,
1257          * throw it away, unless the lower layers told
1258          * us to try again.
1259          */
1260         if (err != -EAGAIN) {
1261                 if (!unmapped)
1262                         block_invalidatepage(page, 0);
1263                 ClearPageUptodate(page);
1264         }
1265         return err;
1266 }
1267
1268 /*
1269  * writepage: Called from one of two places:
1270  *
1271  * 1. we are flushing a delalloc buffer head.
1272  *
1273  * 2. we are writing out a dirty page. Typically the page dirty
1274  *    state is cleared before we get here. In this case it is
1275  *    conceivable we have no buffer heads.
1276  *
1277  * For delalloc space on the page we need to allocate space and
1278  * flush it. For unmapped buffer heads on the page we should
1279  * allocate space if the page is uptodate. For any other dirty
1280  * buffer heads on the page we should flush them.
1281  *
1282  * If we detect that a transaction would be required to flush
1283  * the page, we have to check the process flags first, if we
1284  * are already in a transaction or disk I/O during allocations
1285  * is off, we need to fail the writepage and redirty the page.
      *
      * Returns 0 both when the page was handed to xfs_page_state_convert()
      * successfully and when it was merely redirtied and unlocked (already in
      * a transaction, or -EAGAIN from the conversion).  Any other negative
      * error from the conversion is returned after unlocking the page.
1286  */
1287
1288 STATIC int
1289 xfs_vm_writepage(
1290         struct page             *page,
1291         struct writeback_control *wbc)
1292 {
1293         int                     error;
1294         int                     need_trans;
1295         int                     delalloc, unmapped, unwritten;
1296         struct inode            *inode = page->mapping->host;
1297
1298         xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0);
1299
1300         /*
1301          * We need a transaction if:
1302          *  1. There are delalloc buffers on the page
1303          *  2. The page is uptodate and we have unmapped buffers
1304          *  3. The page is uptodate and we have no buffers
1305          *  4. There are unwritten buffers on the page
1306          */
1307
             /*
              * Without buffers, delalloc and unwritten stay unset; that is
              * fine because only unmapped and need_trans are read below.
              */
1308         if (!page_has_buffers(page)) {
1309                 unmapped = 1;
1310                 need_trans = 1;
1311         } else {
1312                 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1313                 if (!PageUptodate(page))
1314                         unmapped = 0;
1315                 need_trans = delalloc + unmapped + unwritten;
1316         }
1317
1318         /*
1319          * If we need a transaction and the process flags say
1320          * we are already in a transaction, or no IO is allowed
1321          * then mark the page dirty again and leave the page
1322          * as is.
1323          */
1324         if (current_test_flags(PF_FSTRANS) && need_trans)
1325                 goto out_fail;
1326
1327         /*
1328          * Delay hooking up buffer heads until we have
1329          * made our go/no-go decision.
1330          */
1331         if (!page_has_buffers(page))
1332                 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1333
1334
1335         /*
1336          *  VM calculation for nr_to_write seems off.  Bump it way
1337          *  up, this gets simple streaming writes zippy again.
1338          *  To be reviewed again after Jens' writeback changes.
1339          */
1340         wbc->nr_to_write *= 4;
1341
1342         /*
1343          * Convert delayed allocate, unwritten or unmapped space
1344          * to real space and flush out to disk.
1345          */
1346         error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
1347         if (error == -EAGAIN)
1348                 goto out_fail;
1349         if (unlikely(error < 0))
1350                 goto out_unlock;
1351
             /* a non-negative result (leftover buffer count) is success */
1352         return 0;
1353
1354 out_fail:
1355         redirty_page_for_writepage(wbc, page);
1356         unlock_page(page);
1357         return 0;
1358 out_unlock:
1359         unlock_page(page);
1360         return error;
1361 }
1362
1363 STATIC int
1364 xfs_vm_writepages(
1365         struct address_space    *mapping,
1366         struct writeback_control *wbc)
1367 {
             /*
              * Clear the XFS_ITRUNCATED flag on the XFS inode that owns the
              * mapping, then let generic_writepages() write the mapping out.
              * The result of generic_writepages() is returned unchanged.
              */
1368         xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1369         return generic_writepages(mapping, wbc);
1370 }
1371
1372 /*
1373  * Called to move a page into cleanable state - and from there
1374  * to be released. Possibly the page is already clean. We normally
1375  * have buffer heads in this call; a page without any returns 0.
1376  *
1377  * Returns 0 if the page cannot be released right now, otherwise the
      * result of try_to_free_buffers() on the page.
1378  *
1379  * Possible scenarios are:
1380  *
1381  * 1. We are being called to release a page which has been written
1382  *    to via regular I/O. buffer heads will be dirty and possibly
1383  *    delalloc. If there are no delalloc (or unwritten) buffer heads
1384  *    in this case then we can go straight to freeing the buffers.
1385  *
1386  * 2. We are called to release a page which has been written via
1387  *    mmap, all we need to do is ensure there is no delalloc
1388  *    state in the buffer heads, if not we can let the caller
1389  *    free them and we should come back later via writepage.
1390  */
1391 STATIC int
1392 xfs_vm_releasepage(
1393         struct page             *page,
1394         gfp_t                   gfp_mask)
1395 {
1396         struct inode            *inode = page->mapping->host;
1397         int                     dirty, delalloc, unmapped, unwritten;
             /* private control block for the conversion; no I/O is started */
1398         struct writeback_control wbc = {
1399                 .sync_mode = WB_SYNC_ALL,
1400                 .nr_to_write = 1,
1401         };
1402
1403         xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0);
1404
1405         if (!page_has_buffers(page))
1406                 return 0;
1407
1408         xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1409         if (!delalloc && !unwritten)
1410                 goto free_buffers;
1411
             /* converting would need filesystem work the caller has forbidden */
1412         if (!(gfp_mask & __GFP_FS))
1413                 return 0;
1414
1415         /* If we are already inside a transaction or the thread cannot
1416          * do I/O, we cannot release this page.
1417          */
1418         if (current_test_flags(PF_FSTRANS))
1419                 return 0;
1420
1421         /*
1422          * Convert delalloc space to real space, do not flush the
1423          * data out to disk, that will be done by the caller.
1424          * Never need to allocate space here - we will always
1425          * come back to writepage in that case.
1426          */
1427         dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
             /*
              * dirty is the leftover buffer count or a negative error; only
              * an exact 0 with no unwritten buffers lets the buffers go.
              */
1428         if (dirty == 0 && !unwritten)
1429                 goto free_buffers;
1430         return 0;
1431
1432 free_buffers:
1433         return try_to_free_buffers(page);
1434 }
1435
1436 STATIC int
1437 __xfs_get_blocks(
1438         struct inode            *inode,
1439         sector_t                iblock,
1440         struct buffer_head      *bh_result,
1441         int                     create,
1442         int                     direct,
1443         bmapi_flags_t           flags)
1444 {
             /*
              * Common block-mapping helper behind xfs_get_blocks() and
              * xfs_get_blocks_direct().
              *
              * Looks up the mapping for file block iblock (converted to a byte
              * offset with the inode's block size shift) over bh_result->b_size
              * bytes via xfs_iomap() and fills in bh_result.  flags is passed
              * to xfs_iomap() only when create is set; otherwise BMAPI_READ is
              * used.  direct is non-zero for the direct I/O variant.
              *
              * Returns 0 on success, including when nothing was mapped, or the
              * negated xfs_iomap() error.
              */
1445         xfs_iomap_t             iomap;
1446         xfs_off_t               offset;
1447         ssize_t                 size;
1448         int                     niomap = 1;
1449         int                     error;
1450
1451         offset = (xfs_off_t)iblock << inode->i_blkbits;
1452         ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1453         size = bh_result->b_size;
1454
             /* direct reads at or beyond EOF map nothing */
1455         if (!create && direct && offset >= i_size_read(inode))
1456                 return 0;
1457
1458         error = xfs_iomap(XFS_I(inode), offset, size,
1459                              create ? flags : BMAPI_READ, &iomap, &niomap);
1460         if (error)
1461                 return -error;
1462         if (niomap == 0)
1463                 return 0;
1464
1465         if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
1466                 /*
1467                  * For unwritten extents do not report a disk address on
1468                  * the read case (treat as if we're reading into a hole).
1469                  */
1470                 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) {
1471                         xfs_map_buffer(bh_result, &iomap, offset,
1472                                        inode->i_blkbits);
1473                 }
1474                 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
                             /*
                              * For direct I/O b_private is set non-NULL; see
                              * the "private" test in xfs_end_io_direct().
                              * NOTE(review): how it reaches that callback is
                              * not visible here - confirm.
                              */
1475                         if (direct)
1476                                 bh_result->b_private = inode;
1477                         set_buffer_unwritten(bh_result);
1478                 }
1479         }
1480
1481         /*
1482          * If this is a realtime file, data may be on a different device
1483          * to that pointed to from the buffer_head b_bdev currently.
1484          */
1485         bh_result->b_bdev = iomap.iomap_target->bt_bdev;
1486
1487         /*
1488          * If we previously allocated a block out beyond eof and we are now
1489          * coming back to use it then we will need to flag it as new even if it
1490          * has a disk address.
1491          *
1492          * With sub-block writes into unwritten extents we also need to mark
1493          * the buffer as new so that the unwritten parts of the buffer gets
1494          * correctly zeroed.
1495          */
1496         if (create &&
1497             ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1498              (offset >= i_size_read(inode)) ||
1499              (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN))))
1500                 set_buffer_new(bh_result);
1501
             /* a delayed-allocation mapping must never be seen by direct I/O */
1502         if (iomap.iomap_flags & IOMAP_DELAY) {
1503                 BUG_ON(direct);
1504                 if (create) {
1505                         set_buffer_uptodate(bh_result);
1506                         set_buffer_mapped(bh_result);
1507                         set_buffer_delay(bh_result);
1508                 }
1509         }
1510
             /*
              * For direct I/O or multi-block requests, report how much of the
              * request this mapping covers, capped at LONG_MAX.  offset is
              * reused here as a byte count.
              */
1511         if (direct || size > (1 << inode->i_blkbits)) {
1512                 ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0);
1513                 offset = min_t(xfs_off_t,
1514                                 iomap.iomap_bsize - iomap.iomap_delta, size);
1515                 bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset);
1516         }
1517
1518         return 0;
1519 }
1520
1521 int
1522 xfs_get_blocks(
1523         struct inode            *inode,
1524         sector_t                iblock,
1525         struct buffer_head      *bh_result,
1526         int                     create)
1527 {
             /*
              * Block-mapping callback for the non-direct paths: calls
              * __xfs_get_blocks() with direct == 0 and BMAPI_WRITE as the flags
              * to use when create is set.  Returns its result unchanged.
              */
1528         return __xfs_get_blocks(inode, iblock,
1529                                 bh_result, create, 0, BMAPI_WRITE);
1530 }
1531
1532 STATIC int
1533 xfs_get_blocks_direct(
1534         struct inode            *inode,
1535         sector_t                iblock,
1536         struct buffer_head      *bh_result,
1537         int                     create)
1538 {
             /*
              * Block-mapping callback for direct I/O: calls __xfs_get_blocks()
              * with direct == 1 and BMAPI_WRITE|BMAPI_DIRECT as the flags to
              * use when create is set.  Returns its result unchanged.
              */
1539         return __xfs_get_blocks(inode, iblock,
1540                                 bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT);
1541 }
1542
1543 STATIC void
1544 xfs_end_io_direct(
1545         struct kiocb    *iocb,
1546         loff_t          offset,
1547         ssize_t         size,
1548         void            *private)
1549 {
             /*
              * Direct I/O completion callback.  The ioend stored in
              * iocb->private by xfs_vm_direct_IO() is given the final offset
              * and size and passed to xfs_finish_ioend(); iocb->private is then
              * cleared so the submitter does not destroy the ioend again.
              */
1550         xfs_ioend_t     *ioend = iocb->private;
1551
1552         /*
1553          * Non-NULL private data means we need to issue a transaction to
1554          * convert a range from unwritten to written extents.  This needs
1555          * to happen from process context but aio+dio I/O completion
1556          * happens from irq context so we need to defer it to a workqueue.
1557          * This is not necessary for synchronous direct I/O, but we do
1558          * it anyway to keep the code uniform and simpler.
1559          *
1560          * Well, if only it were that simple. Because synchronous direct I/O
1561          * requires extent conversion to occur *before* we return to userspace,
1562          * we have to wait for extent conversion to complete. Look at the
1563          * iocb that has been passed to us to determine if this is AIO or
1564          * not. If it is synchronous, tell xfs_finish_ioend() to kick the
1565          * workqueue and wait for it to complete.
1566          *
1567          * The core direct I/O code might be changed to always call the
1568          * completion handler in the future, in which case all this can
1569          * go away.
1570          */
1571         ioend->io_offset = offset;
1572         ioend->io_size = size;
1573         if (ioend->io_type == IOMAP_READ) {
1574                 xfs_finish_ioend(ioend, 0);
1575         } else if (private && size > 0) {
                     /* unwritten conversion needed: wait only for sync iocbs */
1576                 xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
1577         } else {
1578                 /*
1579                  * A direct I/O write ioend starts its life in unwritten
1580                  * state in case they map an unwritten extent.  This write
1581                  * didn't map an unwritten extent so switch its completion
1582                  * handler.
1583                  */
1584                 INIT_WORK(&ioend->io_work, xfs_end_bio_written);
1585                 xfs_finish_ioend(ioend, 0);
1586         }
1587
1588         /*
1589          * blockdev_direct_IO can return an error even after the I/O
1590          * completion handler was called.  Thus we need to protect
1591          * against double-freeing.
1592          */
1593         iocb->private = NULL;
1594 }
1595
1596 STATIC ssize_t
1597 xfs_vm_direct_IO(
1598         int                     rw,
1599         struct kiocb            *iocb,
1600         const struct iovec      *iov,
1601         loff_t                  offset,
1602         unsigned long           nr_segs)
1603 {
             /*
              * Direct I/O entry point.  An ioend is allocated up front and
              * parked in iocb->private for the completion handler
              * xfs_end_io_direct(): type IOMAP_UNWRITTEN for writes, IOMAP_READ
              * for everything else.  Returns whatever the blockdev_direct_IO
              * helper returns.
              */
1604         struct file     *file = iocb->ki_filp;
1605         struct inode    *inode = file->f_mapping->host;
1606         struct block_device *bdev;
1607         ssize_t         ret;
1608
1609         bdev = xfs_find_bdev_for_inode(XFS_I(inode));
1610
1611         if (rw == WRITE) {
1612                 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
1613                 ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
1614                         bdev, iov, offset, nr_segs,
1615                         xfs_get_blocks_direct,
1616                         xfs_end_io_direct);
1617         } else {
1618                 iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
1619                 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
1620                         bdev, iov, offset, nr_segs,
1621                         xfs_get_blocks_direct,
1622                         xfs_end_io_direct);
1623         }
1624
             /*
              * xfs_end_io_direct() clears iocb->private once it has consumed
              * the ioend.  If the I/O was not queued and the pointer is still
              * set, the handler never ran, so the ioend is destroyed here.
              */
1625         if (unlikely(ret != -EIOCBQUEUED && iocb->private))
1626                 xfs_destroy_ioend(iocb->private);
1627         return ret;
1628 }
1629
1630 STATIC int
1631 xfs_vm_write_begin(
1632         struct file             *file,
1633         struct address_space    *mapping,
1634         loff_t                  pos,
1635         unsigned                len,
1636         unsigned                flags,
1637         struct page             **pagep,
1638         void                    **fsdata)
1639 {
1640         *pagep = NULL;
1641         return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
1642                                                                 xfs_get_blocks);
1643 }
1644
1645 STATIC sector_t
1646 xfs_vm_bmap(
1647         struct address_space    *mapping,
1648         sector_t                block)
1649 {
1650         struct inode            *inode = (struct inode *)mapping->host;
1651         struct xfs_inode        *ip = XFS_I(inode);
1652
1653         xfs_itrace_entry(XFS_I(inode));
1654         xfs_ilock(ip, XFS_IOLOCK_SHARED);
1655         xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
1656         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1657         return generic_block_bmap(mapping, block, xfs_get_blocks);
1658 }
1659
1660 STATIC int
1661 xfs_vm_readpage(
1662         struct file             *unused,
1663         struct page             *page)
1664 {
1665         return mpage_readpage(page, xfs_get_blocks);
1666 }
1667
1668 STATIC int
1669 xfs_vm_readpages(
1670         struct file             *unused,
1671         struct address_space    *mapping,
1672         struct list_head        *pages,
1673         unsigned                nr_pages)
1674 {
1675         return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1676 }
1677
1678 STATIC void
1679 xfs_vm_invalidatepage(
1680         struct page             *page,
1681         unsigned long           offset)
1682 {
1683         xfs_page_trace(XFS_INVALIDPAGE_ENTER,
1684                         page->mapping->host, page, offset);
1685         block_invalidatepage(page, offset);
1686 }
1687
1688 const struct address_space_operations xfs_address_space_operations = {
1689         .readpage               = xfs_vm_readpage,
1690         .readpages              = xfs_vm_readpages,
1691         .writepage              = xfs_vm_writepage,
1692         .writepages             = xfs_vm_writepages,
1693         .sync_page              = block_sync_page,
1694         .releasepage            = xfs_vm_releasepage,
1695         .invalidatepage         = xfs_vm_invalidatepage,
1696         .write_begin            = xfs_vm_write_begin,
1697         .write_end              = generic_write_end,
1698         .bmap                   = xfs_vm_bmap,
1699         .direct_IO              = xfs_vm_direct_IO,
1700         .migratepage            = buffer_migrate_page,
1701         .is_partially_uptodate  = block_is_partially_uptodate,
1702         .error_remove_page      = generic_error_remove_page,
1703 };