fs/bcachefs/fs-io-buffered.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #ifndef NO_BCACHEFS_FS
   3
   4 #include "bcachefs.h"
   5 #include "alloc_foreground.h"
   6 #include "bkey_buf.h"
   7 #include "fs-io.h"
   8 #include "fs-io-buffered.h"
   9 #include "fs-io-direct.h"
  10 #include "fs-io-pagecache.h"
  11 #include "io_read.h"
  12 #include "io_write.h"
  13
  14 #include <linux/backing-dev.h>
  15 #include <linux/pagemap.h>
  16 #include <linux/writeback.h>
  17
  18 static inline bool bio_full(struct bio *bio, unsigned len)
  19 {
  20         if (bio->bi_vcnt >= bio->bi_max_vecs)
  21                 return true;
  22         if (bio->bi_iter.bi_size > UINT_MAX - len)
  23                 return true;
  24         return false;
  25 }
  26
  27 /* readpage(s): */
  28
  29 static void bch2_readpages_end_io(struct bio *bio)
  30 {
  31         struct folio_iter fi;
  32
  33         bio_for_each_folio_all(fi, bio)
  34                 folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK);
  35
  36         bio_put(bio);
  37 }
  38
/* Cursor over the folios collected for a single readahead request. */
struct readpages_iter {
        /* mapping the request is for (copied from the readahead_control) */
        struct address_space    *mapping;
        /* index into @folios of the next folio to hand out */
        unsigned                idx;
        /* folios pulled out of the readahead_control, in the order returned */
        folios                  folios;
};
  44
  45 static int readpages_iter_init(struct readpages_iter *iter,
  46                                struct readahead_control *ractl)
  47 {
  48         struct folio *folio;
  49
  50         *iter = (struct readpages_iter) { ractl->mapping };
  51
  52         while ((folio = __readahead_folio(ractl))) {
  53                 if (!bch2_folio_create(folio, GFP_KERNEL) ||
  54                     darray_push(&iter->folios, folio)) {
  55                         bch2_folio_release(folio);
  56                         ractl->_nr_pages += folio_nr_pages(folio);
  57                         ractl->_index -= folio_nr_pages(folio);
  58                         return iter->folios.nr ? 0 : -ENOMEM;
  59                 }
  60
  61                 folio_put(folio);
  62         }
  63
  64         return 0;
  65 }
  66
  67 static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
  68 {
  69         if (iter->idx >= iter->folios.nr)
  70                 return NULL;
  71         return iter->folios.data[iter->idx];
  72 }
  73
  74 static inline void readpage_iter_advance(struct readpages_iter *iter)
  75 {
  76         iter->idx++;
  77 }
  78
  79 static bool extent_partial_reads_expensive(struct bkey_s_c k)
  80 {
  81         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
  82         struct bch_extent_crc_unpacked crc;
  83         const union bch_extent_entry *i;
  84
  85         bkey_for_each_crc(k.k, ptrs, crc, i)
  86                 if (crc.csum_type || crc.compression_type)
  87                         return true;
  88         return false;
  89 }
  90
/*
 * Grow @bio with further contiguous folios, until it covers
 * @sectors_this_extent sectors or runs out of bvec slots.
 *
 * Folios are taken from @iter first.  Once @iter is exhausted, and only if
 * @get_more is set, new order-0 folios are allocated and inserted into the
 * page cache at the index immediately following the current end of the bio.
 *
 * Btree locks are dropped while this runs; the return value is that of
 * bch2_trans_relock().
 */
static int readpage_bio_extend(struct btree_trans *trans,
                               struct readpages_iter *iter,
                               struct bio *bio,
                               unsigned sectors_this_extent,
                               bool get_more)
{
        /* Don't hold btree locks while allocating memory: */
        bch2_trans_unlock(trans);

        while (bio_sectors(bio) < sectors_this_extent &&
               bio->bi_vcnt < bio->bi_max_vecs) {
                struct folio *folio = readpage_iter_peek(iter);
                int ret;

                if (folio) {
                        readpage_iter_advance(iter);
                } else {
                        /* page cache index right after the end of the bio */
                        pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;

                        if (!get_more)
                                break;

                        /* stop if a real folio (not a value entry) is already there */
                        folio = xa_load(&iter->mapping->i_pages, folio_offset);
                        if (folio && !xa_is_value(folio))
                                break;

                        folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
                        if (!folio)
                                break;

                        if (!__bch2_folio_create(folio, GFP_KERNEL)) {
                                folio_put(folio);
                                break;
                        }

                        ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
                        if (ret) {
                                __bch2_folio_release(folio);
                                folio_put(folio);
                                break;
                        }

                        /*
                         * Drop the allocation reference.
                         * NOTE(review): presumably the page cache holds its
                         * own reference after filemap_add_folio() - confirm.
                         */
                        folio_put(folio);
                }

                /* the folio must start exactly where the bio currently ends */
                BUG_ON(folio_sector(folio) != bio_end_sector(bio));

                BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
        }

        return bch2_trans_relock(trans);
}
 143
/*
 * Issue the reads for @rbio against inode @inum.
 *
 * Walks the extents btree starting at the bio's current sector and calls
 * bch2_read_extent() once per extent fragment until the whole bio has been
 * handed off.  When @readpages_iter is non-NULL the bio may first be extended
 * with more folios via readpage_bio_extend(); bch2_read_single_folio() passes
 * NULL.
 *
 * Transaction restarts cause the current iteration to be retried.  Any other
 * error is logged and completes the bio with BLK_STS_IOERR.
 */
static void bchfs_read(struct btree_trans *trans,
                       struct bch_read_bio *rbio,
                       subvol_inum inum,
                       struct readpages_iter *readpages_iter)
{
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_buf sk;
        int flags = BCH_READ_RETRY_IF_STALE|
                BCH_READ_MAY_PROMOTE;
        int ret = 0;

        rbio->c = c;
        rbio->start_time = local_clock();
        rbio->subvol = inum.subvol;

        bch2_bkey_buf_init(&sk);
        bch2_trans_begin(trans);
        bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                             POS(inum.inum, rbio->bio.bi_iter.bi_sector),
                             BTREE_ITER_slots);
        while (1) {
                struct bkey_s_c k;
                unsigned bytes, sectors, offset_into_extent;
                enum btree_id data_btree = BTREE_ID_extents;

                bch2_trans_begin(trans);

                /* re-resolve the subvolume's snapshot on every pass */
                u32 snapshot;
                ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
                if (ret)
                        goto err;

                bch2_btree_iter_set_snapshot(&iter, snapshot);

                /* position at the first sector of the bio not yet handed off */
                bch2_btree_iter_set_pos(&iter,
                                POS(inum.inum, rbio->bio.bi_iter.bi_sector));

                k = bch2_btree_iter_peek_slot(&iter);
                ret = bkey_err(k);
                if (ret)
                        goto err;

                offset_into_extent = iter.pos.offset -
                        bkey_start_offset(k.k);
                sectors = k.k->size - offset_into_extent;

                /* copy the key into sk; from here on k refers to that copy */
                bch2_bkey_buf_reassemble(&sk, c, k);

                ret = bch2_read_indirect_extent(trans, &data_btree,
                                        &offset_into_extent, &sk);
                if (ret)
                        goto err;

                k = bkey_i_to_s_c(sk.k);

                /* the key in sk may be shorter than the one first looked up */
                sectors = min(sectors, k.k->size - offset_into_extent);

                if (readpages_iter) {
                        ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
                                                  extent_partial_reads_expensive(k));
                        if (ret)
                                goto err;
                }

                /*
                 * Temporarily shrink the bio to the part covered by this
                 * extent; after the swap @bytes holds the bio's full remaining
                 * size.
                 */
                bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
                swap(rbio->bio.bi_iter.bi_size, bytes);

                /* nothing was cut off, so this fragment finishes the bio */
                if (rbio->bio.bi_iter.bi_size == bytes)
                        flags |= BCH_READ_LAST_FRAGMENT;

                bch2_bio_page_state_set(&rbio->bio, k);

                bch2_read_extent(trans, rbio, iter.pos,
                                 data_btree, k, offset_into_extent, flags);

                if (flags & BCH_READ_LAST_FRAGMENT)
                        break;

                /* restore the full size, then step past the fragment just issued */
                swap(rbio->bio.bi_iter.bi_size, bytes);
                bio_advance(&rbio->bio, bytes);
err:
                /* transaction restarts loop again; any other error is fatal */
                if (ret &&
                    !bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        break;
        }
        bch2_trans_iter_exit(trans, &iter);

        if (ret) {
                /*
                 * NOTE(review): iter.pos is read after bch2_trans_iter_exit();
                 * presumably exit leaves pos intact - confirm.
                 */
                bch_err_inum_offset_ratelimited(c,
                                iter.pos.inode,
                                iter.pos.offset << 9,
                                "read error %i from btree lookup", ret);
                rbio->bio.bi_status = BLK_STS_IOERR;
                bio_endio(&rbio->bio);
        }

        bch2_bkey_buf_exit(&sk, c);
}
 243
 244 void bch2_readahead(struct readahead_control *ractl)
 245 {
 246         struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
 247         struct bch_fs *c = inode->v.i_sb->s_fs_info;
 248         struct bch_io_opts opts;
 249         struct folio *folio;
 250         struct readpages_iter readpages_iter;
 251
 252         bch2_inode_opts_get(&opts, c, &inode->ei_inode);
 253
 254         int ret = readpages_iter_init(&readpages_iter, ractl);
 255         if (ret)
 256                 return;
 257
 258         bch2_pagecache_add_get(inode);
 259
 260         struct btree_trans *trans = bch2_trans_get(c);
 261         while ((folio = readpage_iter_peek(&readpages_iter))) {
 262                 unsigned n = min_t(unsigned,
 263                                    readpages_iter.folios.nr -
 264                                    readpages_iter.idx,
 265                                    BIO_MAX_VECS);
 266                 struct bch_read_bio *rbio =
 267                         rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
 268                                                    GFP_KERNEL, &c->bio_read),
 269                                   opts);
 270
 271                 readpage_iter_advance(&readpages_iter);
 272
 273                 rbio->bio.bi_iter.bi_sector = folio_sector(folio);
 274                 rbio->bio.bi_end_io = bch2_readpages_end_io;
 275                 BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
 276
 277                 bchfs_read(trans, rbio, inode_inum(inode),
 278                            &readpages_iter);
 279                 bch2_trans_unlock(trans);
 280         }
 281         bch2_trans_put(trans);
 282
 283         bch2_pagecache_add_put(inode);
 284
 285         darray_exit(&readpages_iter.folios);
 286 }
 287
 288 static void bch2_read_single_folio_end_io(struct bio *bio)
 289 {
 290         complete(bio->bi_private);
 291 }
 292
 293 int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
 294 {
 295         struct bch_inode_info *inode = to_bch_ei(mapping->host);
 296         struct bch_fs *c = inode->v.i_sb->s_fs_info;
 297         struct bch_read_bio *rbio;
 298         struct bch_io_opts opts;
 299         int ret;
 300         DECLARE_COMPLETION_ONSTACK(done);
 301
 302         if (!bch2_folio_create(folio, GFP_KERNEL))
 303                 return -ENOMEM;
 304
 305         bch2_inode_opts_get(&opts, c, &inode->ei_inode);
 306
 307         rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
 308                          opts);
 309         rbio->bio.bi_private = &done;
 310         rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
 311
 312         rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
 313         rbio->bio.bi_iter.bi_sector = folio_sector(folio);
 314         BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
 315
 316         bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
 317         wait_for_completion(&done);
 318
 319         ret = blk_status_to_errno(rbio->bio.bi_status);
 320         bio_put(&rbio->bio);
 321
 322         if (ret < 0)
 323                 return ret;
 324
 325         folio_mark_uptodate(folio);
 326         return 0;
 327 }
 328
 329 int bch2_read_folio(struct file *file, struct folio *folio)
 330 {
 331         int ret;
 332
 333         ret = bch2_read_single_folio(folio, folio->mapping);
 334         folio_unlock(folio);
 335         return bch2_err_class(ret);
 336 }
 337
 338 /* writepages: */
 339
/*
 * One in-flight writeback io.  It is recovered from the embedded bio
 * (op.wbio.bio) with container_of() when allocated, and from the write op on
 * completion.
 */
struct bch_writepage_io {
        /* inode whose folios this io writes back */
        struct bch_inode_info           *inode;

        /* must be last: */
        struct bch_write_op             op;
};
 346
/* State carried across the per-folio callbacks of one bch2_writepages() call. */
struct bch_writepage_state {
        /* io currently being built; NULL when there is none */
        struct bch_writepage_io *io;
        /* io options, filled in from the inode */
        struct bch_io_opts      opts;
        /* scratch copy of a folio's per-sector state; freed by bch2_writepages() */
        struct bch_folio_sector *tmp;
        /* number of sectors @tmp has room for */
        unsigned                tmp_sectors;
};
 353
 354 static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
 355                                                                   struct bch_inode_info *inode)
 356 {
 357         struct bch_writepage_state ret = { 0 };
 358
 359         bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
 360         return ret;
 361 }
 362
 363 /*
 364  * Determine when a writepage io is full. We have to limit writepage bios to a
 365  * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
 366  * what the bounce path in bch2_write_extent() can handle. In theory we could
 367  * loosen this restriction for non-bounce I/O, but we don't have that context
 368  * here. Ideally, we can up this limit and make it configurable in the future
 369  * when the bounce path can be enhanced to accommodate larger source bios.
 370  */
 371 static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
 372 {
 373         struct bio *bio = &io->op.wbio.bio;
 374         return bio_full(bio, len) ||
 375                 (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
 376 }
 377
 378 static void bch2_writepage_io_done(struct bch_write_op *op)
 379 {
 380         struct bch_writepage_io *io =
 381                 container_of(op, struct bch_writepage_io, op);
 382         struct bch_fs *c = io->op.c;
 383         struct bio *bio = &io->op.wbio.bio;
 384         struct folio_iter fi;
 385         unsigned i;
 386
 387         if (io->op.error) {
 388                 set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
 389
 390                 bio_for_each_folio_all(fi, bio) {
 391                         struct bch_folio *s;
 392
 393                         mapping_set_error(fi.folio->mapping, -EIO);
 394
 395                         s = __bch2_folio(fi.folio);
 396                         spin_lock(&s->lock);
 397                         for (i = 0; i < folio_sectors(fi.folio); i++)
 398                                 s->s[i].nr_replicas = 0;
 399                         spin_unlock(&s->lock);
 400                 }
 401         }
 402
 403         if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
 404                 bio_for_each_folio_all(fi, bio) {
 405                         struct bch_folio *s;
 406
 407                         s = __bch2_folio(fi.folio);
 408                         spin_lock(&s->lock);
 409                         for (i = 0; i < folio_sectors(fi.folio); i++)
 410                                 s->s[i].nr_replicas = 0;
 411                         spin_unlock(&s->lock);
 412                 }
 413         }
 414
 415         /*
 416          * racing with fallocate can cause us to add fewer sectors than
 417          * expected - but we shouldn't add more sectors than expected:
 418          */
 419         WARN_ON_ONCE(io->op.i_sectors_delta > 0);
 420
 421         /*
 422          * (error (due to going RO) halfway through a page can screw that up
 423          * slightly)
 424          * XXX wtf?
 425            BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
 426          */
 427
 428         /*
 429          * The writeback flag is effectively our ref on the inode -
 430          * fixup i_blocks before calling folio_end_writeback:
 431          */
 432         bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
 433
 434         bio_for_each_folio_all(fi, bio) {
 435                 struct bch_folio *s = __bch2_folio(fi.folio);
 436
 437                 if (atomic_dec_and_test(&s->write_count))
 438                         folio_end_writeback(fi.folio);
 439         }
 440
 441         bio_put(&io->op.wbio.bio);
 442 }
 443
 444 static void bch2_writepage_do_io(struct bch_writepage_state *w)
 445 {
 446         struct bch_writepage_io *io = w->io;
 447
 448         w->io = NULL;
 449         closure_call(&io->op.cl, bch2_write, NULL, NULL);
 450 }
 451
 452 /*
 453  * Get a bch_writepage_io and add @page to it - appending to an existing one if
 454  * possible, else allocating a new one:
 455  */
 456 static void bch2_writepage_io_alloc(struct bch_fs *c,
 457                                     struct writeback_control *wbc,
 458                                     struct bch_writepage_state *w,
 459                                     struct bch_inode_info *inode,
 460                                     u64 sector,
 461                                     unsigned nr_replicas)
 462 {
 463         struct bch_write_op *op;
 464
 465         w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
 466                                               REQ_OP_WRITE,
 467                                               GFP_KERNEL,
 468                                               &c->writepage_bioset),
 469                              struct bch_writepage_io, op.wbio.bio);
 470
 471         w->io->inode            = inode;
 472         op                      = &w->io->op;
 473         bch2_write_op_init(op, c, w->opts);
 474         op->target              = w->opts.foreground_target;
 475         op->nr_replicas         = nr_replicas;
 476         op->res.nr_replicas     = nr_replicas;
 477         op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
 478         op->subvol              = inode->ei_inum.subvol;
 479         op->pos                 = POS(inode->v.i_ino, sector);
 480         op->end_io              = bch2_writepage_io_done;
 481         op->devs_need_flush     = &inode->ei_devs_need_flush;
 482         op->wbio.bio.bi_iter.bi_sector = sector;
 483         op->wbio.bio.bi_opf     = wbc_to_write_flags(wbc);
 484 }
 485
/*
 * Per-folio writeback callback, passed to write_cache_pages() by
 * bch2_writepages().
 *
 * @data is the struct bch_writepage_state of the writepages call.  Each
 * contiguous run of dirty sectors in @folio is appended to the io being built
 * in that state; the io is submitted and a new one allocated whenever the run
 * cannot be appended to it.
 *
 * The folio arrives locked and is unlocked here.  Always returns 0.
 */
static int __bch2_writepage(struct folio *folio,
                            struct writeback_control *wbc,
                            void *data)
{
        struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_writepage_state *w = data;
        struct bch_folio *s;
        unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
        loff_t i_size = i_size_read(&inode->v);
        int ret;

        EBUG_ON(!folio_test_uptodate(folio));

        /* Is the folio fully inside i_size? */
        if (folio_end_pos(folio) <= i_size)
                goto do_io;

        /* Is the folio fully outside i_size? (truncate in progress) */
        if (folio_pos(folio) >= i_size) {
                folio_unlock(folio);
                return 0;
        }

        /*
         * The folio straddles i_size.  It must be zeroed out on each and every
         * writepage invocation because it may be mmapped.  "A file is mapped
         * in multiples of the folio size.  For a file that is not a multiple of
         * the folio size, the remaining memory is zeroed when mapped, and
         * writes to that region are not written out to the file."
         */
        folio_zero_segment(folio,
                           i_size - folio_pos(folio),
                           folio_size(folio));
do_io:
        f_sectors = folio_sectors(folio);
        s = bch2_folio(folio);

        /* grow the scratch buffer if this folio has more sectors than it holds */
        if (f_sectors > w->tmp_sectors) {
                kfree(w->tmp);
                w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL);
                w->tmp_sectors = f_sectors;
        }

        /*
         * Things get really hairy with errors during writeback:
         */
        ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
        BUG_ON(ret);

        /* Before unlocking the page, get copy of reservations: */
        spin_lock(&s->lock);
        memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);

        /*
         * Replicas for this write: the minimum, over all dirty sectors, of
         * existing plus reserved replicas.
         */
        for (i = 0; i < f_sectors; i++) {
                if (s->s[i].state < SECTOR_dirty)
                        continue;

                nr_replicas_this_write =
                        min_t(unsigned, nr_replicas_this_write,
                              s->s[i].nr_replicas +
                              s->s[i].replicas_reserved);
        }

        /*
         * Update the live per-sector state for the sectors being written; the
         * pre-write state survives in w->tmp for the loop below.
         */
        for (i = 0; i < f_sectors; i++) {
                if (s->s[i].state < SECTOR_dirty)
                        continue;

                s->s[i].nr_replicas = w->opts.compression
                        ? 0 : nr_replicas_this_write;

                s->s[i].replicas_reserved = 0;
                bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
        }
        spin_unlock(&s->lock);

        /*
         * Hold one count of our own while segments are being queued; it is
         * dropped at the bottom of this function.
         */
        BUG_ON(atomic_read(&s->write_count));
        atomic_set(&s->write_count, 1);

        BUG_ON(folio_test_writeback(folio));
        folio_start_writeback(folio);

        folio_unlock(folio);

        offset = 0;
        while (1) {
                unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
                u64 sector;

                /* skip sectors that were not dirty in the snapshot */
                while (offset < f_sectors &&
                       w->tmp[offset].state < SECTOR_dirty)
                        offset++;

                if (offset == f_sectors)
                        break;

                /* measure the run of sectors to write that starts at @offset */
                while (offset + sectors < f_sectors &&
                       w->tmp[offset + sectors].state >= SECTOR_dirty) {
                        reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
                        dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
                        sectors++;
                }
                BUG_ON(!sectors);

                sector = folio_sector(folio) + offset;

                /*
                 * Submit the io being built if this run cannot be appended:
                 * different replica count, io full, or not contiguous with it.
                 */
                if (w->io &&
                    (w->io->op.res.nr_replicas != nr_replicas_this_write ||
                     bch_io_full(w->io, sectors << 9) ||
                     bio_end_sector(&w->io->op.wbio.bio) != sector))
                        bch2_writepage_do_io(w);

                if (!w->io)
                        bch2_writepage_io_alloc(c, wbc, w, inode, sector,
                                                nr_replicas_this_write);

                /* one count per segment; bch2_writepage_io_done() drops it */
                atomic_inc(&s->write_count);

                BUG_ON(inode != w->io->inode);
                BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
                                     sectors << 9, offset << 9));

                /* Check for writing past i_size: */
                WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
                          round_up(i_size, block_bytes(c)) &&
                          !test_bit(BCH_FS_emergency_ro, &c->flags),
                          "writing past i_size: %llu > %llu (unrounded %llu)\n",
                          bio_end_sector(&w->io->op.wbio.bio) << 9,
                          round_up(i_size, block_bytes(c)),
                          i_size);

                w->io->op.res.sectors += reserved_sectors;
                w->io->op.i_sectors_delta -= dirty_sectors;
                w->io->op.new_i_size = i_size;

                offset += sectors;
        }

        /* drop our own count; ends writeback now if no segment is outstanding */
        if (atomic_dec_and_test(&s->write_count))
                folio_end_writeback(folio);

        return 0;
}
 629
 630 int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
 631 {
 632         struct bch_fs *c = mapping->host->i_sb->s_fs_info;
 633         struct bch_writepage_state w =
 634                 bch_writepage_state_init(c, to_bch_ei(mapping->host));
 635         struct blk_plug plug;
 636         int ret;
 637
 638         blk_start_plug(&plug);
 639         ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
 640         if (w.io)
 641                 bch2_writepage_do_io(&w);
 642         blk_finish_plug(&plug);
 643         kfree(w.tmp);
 644         return bch2_err_class(ret);
 645 }
 646
 647 /* buffered writes: */
 648
/*
 * Prepare a folio for a buffered write of @len bytes at @pos.
 *
 * Allocates a folio reservation (handed back through @fsdata), takes the
 * pagecache add lock, and gets the folio covering @pos.  The folio is read
 * from disk first when the parts of it that the write leaves untouched cannot
 * simply be zeroed.
 *
 * On success returns 0 with *foliop set; the folio reference, the pagecache
 * add lock and the reservation are all still held, and bch2_write_end()
 * releases them.  On failure everything taken here is released, *fsdata is set
 * to NULL, and the error is returned through bch2_err_class().
 */
int bch2_write_begin(struct file *file, struct address_space *mapping,
                     loff_t pos, unsigned len,
                     struct folio **foliop, void **fsdata)
{
        struct bch_inode_info *inode = to_bch_ei(mapping->host);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch2_folio_reservation *res;
        struct folio *folio;
        unsigned offset;
        int ret = -ENOMEM;

        res = kmalloc(sizeof(*res), GFP_KERNEL);
        if (!res)
                return -ENOMEM;

        bch2_folio_reservation_init(c, inode, res);
        *fsdata = res;

        bch2_pagecache_add_get(inode);

        folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
                                    FGP_WRITEBEGIN | fgf_set_order(len),
                                    mapping_gfp_mask(mapping));
        /* ret is still -ENOMEM here, whatever error the lookup reported */
        if (IS_ERR_OR_NULL(folio))
                goto err_unlock;

        /* clamp the write to this folio */
        offset = pos - folio_pos(folio);
        len = min_t(size_t, len, folio_end_pos(folio) - pos);

        if (folio_test_uptodate(folio))
                goto out;

        /* If we're writing entire folio, don't need to read it in first: */
        if (!offset && len == folio_size(folio))
                goto out;

        /*
         * Write starts at the beginning of the folio and reaches i_size or
         * beyond: zero everything after the written range instead of reading.
         */
        if (!offset && pos + len >= inode->v.i_size) {
                folio_zero_segment(folio, len, folio_size(folio));
                flush_dcache_folio(folio);
                goto out;
        }

        /*
         * Folio lies entirely at or past i_size: zero both sides of the
         * written range instead of reading.
         */
        if (folio_pos(folio) >= inode->v.i_size) {
                folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
                flush_dcache_folio(folio);
                goto out;
        }
readpage:
        ret = bch2_read_single_folio(folio, mapping);
        if (ret)
                goto err;
out:
        ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
        if (ret)
                goto err;

        ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
        if (ret) {
                if (!folio_test_uptodate(folio)) {
                        /*
                         * If the folio hasn't been read in, we won't know if we
                         * actually need a reservation - we don't actually need
                         * to read here, we just need to check if the folio is
                         * fully backed by uncompressed data:
                         */
                        goto readpage;
                }

                goto err;
        }

        *foliop = folio;
        return 0;
err:
        folio_unlock(folio);
        folio_put(folio);
err_unlock:
        bch2_pagecache_add_put(inode);
        kfree(res);
        *fsdata = NULL;
        return bch2_err_class(ret);
}
 731
 732 int bch2_write_end(struct file *file, struct address_space *mapping,
 733                    loff_t pos, unsigned len, unsigned copied,
 734                    struct folio *folio, void *fsdata)
 735 {
 736         struct bch_inode_info *inode = to_bch_ei(mapping->host);
 737         struct bch_fs *c = inode->v.i_sb->s_fs_info;
 738         struct bch2_folio_reservation *res = fsdata;
 739         unsigned offset = pos - folio_pos(folio);
 740
 741         lockdep_assert_held(&inode->v.i_rwsem);
 742         BUG_ON(offset + copied > folio_size(folio));
 743
 744         if (unlikely(copied < len && !folio_test_uptodate(folio))) {
 745                 /*
 746                  * The folio needs to be read in, but that would destroy
 747                  * our partial write - simplest thing is to just force
 748                  * userspace to redo the write:
 749                  */
 750                 folio_zero_range(folio, 0, folio_size(folio));
 751                 flush_dcache_folio(folio);
 752                 copied = 0;
 753         }
 754
 755         spin_lock(&inode->v.i_lock);
 756         if (pos + copied > inode->v.i_size)
 757                 i_size_write(&inode->v, pos + copied);
 758         spin_unlock(&inode->v.i_lock);
 759
 760         if (copied) {
 761                 if (!folio_test_uptodate(folio))
 762                         folio_mark_uptodate(folio);
 763
 764                 bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
 765
 766                 inode->ei_last_dirtied = (unsigned long) current;
 767         }
 768
 769         folio_unlock(folio);
 770         folio_put(folio);
 771         bch2_pagecache_add_put(inode);
 772
 773         bch2_folio_reservation_put(c, inode, res);
 774         kfree(res);
 775
 776         return copied;
 777 }
 778
 779 static noinline void folios_trunc(folios *fs, struct folio **fi)
 780 {
 781         while (fs->data + fs->nr > fi) {
 782                 struct folio *f = darray_pop(fs);
 783
 784                 folio_unlock(f);
 785                 folio_put(f);
 786         }
 787 }
 788
/*
 * Copy up to @len bytes from @iter into the pagecache of @inode, starting at
 * file offset @pos.
 *
 * The steps are: get the folios covering the range, bring partially written
 * edge folios into a usable state, take a reservation for each folio's share
 * of the write, copy the data in with copy_folio_from_iter_atomic(), then
 * update i_size and hand the written folios to bch2_set_folio_dirty().
 *
 * The write may end up shorter than @len: @end is clamped to the folios that
 * were obtained, to what could be reserved, and to what could be copied.
 *
 * Returns the number of bytes copied when that is nonzero; otherwise the
 * error recorded by the failing step, or 0 if nothing was copied and no error
 * was recorded.
 */
static int __bch2_buffered_write(struct bch_inode_info *inode,
                                 struct address_space *mapping,
                                 struct iov_iter *iter,
                                 loff_t pos, unsigned len)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch2_folio_reservation res;
        folios fs;
        struct folio *f;
        unsigned copied = 0, f_offset, f_copied;
        u64 end = pos + len, f_pos, f_len;
        /*
         * Starts out as i_size, so the post-EOF cleanup at the end also runs
         * if we bail out before this is set from the last folio.
         */
        loff_t last_folio_pos = inode->v.i_size;
        int ret = 0;

        BUG_ON(!len);

        bch2_folio_reservation_init(c, inode, &res);
        darray_init(&fs);

        ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
                                               FGP_WRITEBEGIN | fgf_set_order(len),
                                               mapping_gfp_mask(mapping), &fs);
        if (ret)
                goto out;

        BUG_ON(!fs.nr);

        /*
         * The write starts partway into the first folio and that folio isn't
         * uptodate: read it in before writing into it.
         */
        f = darray_first(fs);
        if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
                ret = bch2_read_single_folio(f, mapping);
                if (ret)
                        goto out;
        }

        /* Clamp the write to the end of the last folio we actually got */
        f = darray_last(fs);
        end = min(end, folio_end_pos(f));
        last_folio_pos = folio_pos(f);
        /*
         * The write ends partway into the last folio and that folio isn't
         * uptodate: zero it if the write ends at or past i_size, otherwise
         * read it in.
         */
        if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
                if (end >= inode->v.i_size) {
                        folio_zero_range(f, 0, folio_size(f));
                } else {
                        ret = bch2_read_single_folio(f, mapping);
                        if (ret)
                                goto out;
                }
        }

        ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
        if (ret)
                goto out;

        /*
         * Pass 1: take a reservation for each folio's portion of the write.
         * f_pos/f_offset track the file position and in-folio offset of the
         * portion; only the first folio can have a nonzero offset.
         */
        f_pos = pos;
        f_offset = pos - folio_pos(darray_first(fs));
        darray_for_each(fs, fi) {
                ssize_t f_reserved;

                f = *fi;
                f_len = min(end, folio_end_pos(f)) - f_pos;
                f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len);

                if (unlikely(f_reserved != f_len)) {
                        if (f_reserved < 0) {
                                /* Failing on the very first folio fails the whole write */
                                if (f == darray_first(fs)) {
                                        ret = f_reserved;
                                        goto out;
                                }

                                /*
                                 * Otherwise drop this folio and everything
                                 * after it, and stop the write at the end of
                                 * the last folio that remains.
                                 */
                                folios_trunc(&fs, fi);
                                end = min(end, folio_end_pos(darray_last(fs)));
                        } else {
                                /*
                                 * Partial reservation: this folio will only be
                                 * partly written, so read it in first if it
                                 * isn't uptodate.
                                 */
                                if (!folio_test_uptodate(f)) {
                                        ret = bch2_read_single_folio(f, mapping);
                                        if (ret)
                                                goto out;
                                }

                                /* Keep this folio, drop the ones after it */
                                folios_trunc(&fs, fi + 1);
                                end = f_pos + f_reserved;
                        }

                        break;
                }

                f_pos = folio_end_pos(f);
                f_offset = 0;
        }

        if (mapping_writably_mapped(mapping))
                darray_for_each(fs, fi)
                        flush_dcache_folio(*fi);

        /* Pass 2: copy the data from @iter into the folios */
        f_pos = pos;
        f_offset = pos - folio_pos(darray_first(fs));
        darray_for_each(fs, fi) {
                f = *fi;
                f_len = min(end, folio_end_pos(f)) - f_pos;
                f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter);
                /* Nothing copied: drop this folio and everything after it */
                if (!f_copied) {
                        folios_trunc(&fs, fi);
                        break;
                }

                /*
                 * The folio isn't uptodate, the copy didn't cover all of it,
                 * and the copied data ends before i_size: undo the copy on
                 * @iter, zero the folio, and drop it and everything after it.
                 */
                if (!folio_test_uptodate(f) &&
                    f_copied != folio_size(f) &&
                    pos + copied + f_copied < inode->v.i_size) {
                        iov_iter_revert(iter, f_copied);
                        folio_zero_range(f, 0, folio_size(f));
                        folios_trunc(&fs, fi);
                        break;
                }

                flush_dcache_folio(f);
                copied += f_copied;

                /* Short copy: keep this folio, drop the ones after it */
                if (f_copied != f_len) {
                        folios_trunc(&fs, fi + 1);
                        break;
                }

                f_pos = folio_end_pos(f);
                f_offset = 0;
        }

        if (!copied)
                goto out;

        /* From here on, @end is the end of the data actually copied */
        end = pos + copied;

        /* Extend i_size, under i_lock, if the copied range ends beyond it */
        spin_lock(&inode->v.i_lock);
        if (end > inode->v.i_size)
                i_size_write(&inode->v, end);
        spin_unlock(&inode->v.i_lock);

        /*
         * Pass 3: mark each remaining folio uptodate and pass its written
         * range, along with the reservation, to bch2_set_folio_dirty().
         */
        f_pos = pos;
        f_offset = pos - folio_pos(darray_first(fs));
        darray_for_each(fs, fi) {
                f = *fi;
                f_len = min(end, folio_end_pos(f)) - f_pos;

                if (!folio_test_uptodate(f))
                        folio_mark_uptodate(f);

                bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);

                f_pos = folio_end_pos(f);
                f_offset = 0;
        }

        inode->ei_last_dirtied = (unsigned long) current;
out:
        /* Unlock and release whichever folios are still in the array */
        darray_for_each(fs, fi) {
                folio_unlock(*fi);
                folio_put(*fi);
        }

        /*
         * If the last folio added to the mapping starts beyond current EOF, we
         * performed a short write but left around at least one post-EOF folio.
         * Clean up the mapping before we return.
         */
        if (last_folio_pos >= inode->v.i_size)
                truncate_pagecache(&inode->v, inode->v.i_size);

        darray_exit(&fs);
        bch2_folio_reservation_put(c, inode, &res);

        /* Bytes copied take precedence over any error recorded in ret */
        return copied ?: ret;
}
 957
 958 static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
 959 {
 960         struct file *file = iocb->ki_filp;
 961         struct address_space *mapping = file->f_mapping;
 962         struct bch_inode_info *inode = file_bch_inode(file);
 963         loff_t pos = iocb->ki_pos;
 964         ssize_t written = 0;
 965         int ret = 0;
 966
 967         bch2_pagecache_add_get(inode);
 968
 969         do {
 970                 unsigned offset = pos & (PAGE_SIZE - 1);
 971                 unsigned bytes = iov_iter_count(iter);
 972 again:
 973                 /*
 974                  * Bring in the user page that we will copy from _first_.
 975                  * Otherwise there's a nasty deadlock on copying from the
 976                  * same page as we're writing to, without it being marked
 977                  * up-to-date.
 978                  *
 979                  * Not only is this an optimisation, but it is also required
 980                  * to check that the address is actually valid, when atomic
 981                  * usercopies are used, below.
 982                  */
 983                 if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
 984                         bytes = min_t(unsigned long, iov_iter_count(iter),
 985                                       PAGE_SIZE - offset);
 986
 987                         if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
 988                                 ret = -EFAULT;
 989                                 break;
 990                         }
 991                 }
 992
 993                 if (unlikely(fatal_signal_pending(current))) {
 994                         ret = -EINTR;
 995                         break;
 996                 }
 997
 998                 ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
 999                 if (unlikely(ret < 0))
1000                         break;
1001
1002                 cond_resched();
1003
1004                 if (unlikely(ret == 0)) {
1005                         /*
1006                          * If we were unable to copy any data at all, we must
1007                          * fall back to a single segment length write.
1008                          *
1009                          * If we didn't fallback here, we could livelock
1010                          * because not all segments in the iov can be copied at
1011                          * once without a pagefault.
1012                          */
1013                         bytes = min_t(unsigned long, PAGE_SIZE - offset,
1014                                       iov_iter_single_seg_count(iter));
1015                         goto again;
1016                 }
1017                 pos += ret;
1018                 written += ret;
1019                 ret = 0;
1020
1021                 balance_dirty_pages_ratelimited(mapping);
1022         } while (iov_iter_count(iter));
1023
1024         bch2_pagecache_add_put(inode);
1025
1026         return written ? written : ret;
1027 }
1028
1029 ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
1030 {
1031         struct file *file = iocb->ki_filp;
1032         struct bch_inode_info *inode = file_bch_inode(file);
1033         ssize_t ret;
1034
1035         if (iocb->ki_flags & IOCB_DIRECT) {
1036                 ret = bch2_direct_write(iocb, from);
1037                 goto out;
1038         }
1039
1040         inode_lock(&inode->v);
1041
1042         ret = generic_write_checks(iocb, from);
1043         if (ret <= 0)
1044                 goto unlock;
1045
1046         ret = file_remove_privs(file);
1047         if (ret)
1048                 goto unlock;
1049
1050         ret = file_update_time(file);
1051         if (ret)
1052                 goto unlock;
1053
1054         ret = bch2_buffered_write(iocb, from);
1055         if (likely(ret > 0))
1056                 iocb->ki_pos += ret;
1057 unlock:
1058         inode_unlock(&inode->v);
1059
1060         if (ret > 0)
1061                 ret = generic_write_sync(iocb, ret);
1062 out:
1063         return bch2_err_class(ret);
1064 }
1065
1066 void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
1067 {
1068         bioset_exit(&c->writepage_bioset);
1069 }
1070
1071 int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
1072 {
1073         if (bioset_init(&c->writepage_bioset,
1074                         4, offsetof(struct bch_writepage_io, op.wbio.bio),
1075                         BIOSET_NEED_BVECS))
1076                 return -BCH_ERR_ENOMEM_writepage_bioset_init;
1077
1078         return 0;
1079 }
1080
1081 #endif /* NO_BCACHEFS_FS */