fs/btrfs/raid56.c

   1 /*
   2  * Copyright (C) 2012 Fusion-io  All rights reserved.
   3  * Copyright (C) 2012 Intel Corp. All rights reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public
   7  * License v2 as published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public
  15  * License along with this program; if not, write to the
  16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  17  * Boston, MA 021110-1307, USA.
  18  */
  19 #include <linux/sched.h>
  20 #include <linux/wait.h>
  21 #include <linux/bio.h>
  22 #include <linux/slab.h>
  23 #include <linux/buffer_head.h>
  24 #include <linux/blkdev.h>
  25 #include <linux/random.h>
  26 #include <linux/iocontext.h>
  27 #include <linux/capability.h>
  28 #include <linux/ratelimit.h>
  29 #include <linux/kthread.h>
  30 #include <linux/raid/pq.h>
  31 #include <linux/hash.h>
  32 #include <linux/list_sort.h>
  33 #include <linux/raid/xor.h>
  34 #include <linux/vmalloc.h>
  35 #include <asm/div64.h>
  36 #include "ctree.h"
  37 #include "extent_map.h"
  38 #include "disk-io.h"
  39 #include "transaction.h"
  40 #include "print-tree.h"
  41 #include "volumes.h"
  42 #include "raid56.h"
  43 #include "async-thread.h"
  44 #include "check-integrity.h"
  45 #include "rcu-string.h"
  46
  47 /* set when additional merges to this rbio are not allowed */
  48 #define RBIO_RMW_LOCKED_BIT     1
  49
  50 /*
  51  * set when this rbio is sitting in the hash, but it is just a cache
  52  * of past RMW
  53  */
  54 #define RBIO_CACHE_BIT          2
  55
  56 /*
  57  * set when it is safe to trust the stripe_pages for caching
  58  */
  59 #define RBIO_CACHE_READY_BIT    3
  60
  61 /*
  62  * bbio and raid_map is managed by the caller, so we shouldn't free
  63  * them here. And besides that, all rbios with this flag should not
  64  * be cached, because we need raid_map to check the rbios' stripe
  65  * is the same or not, but it is very likely that the caller has
  66  * free raid_map, so don't cache those rbios.
  67  */
  68 #define RBIO_HOLD_BBIO_MAP_BIT  4
  69
  70 #define RBIO_CACHE_SIZE 1024
  71
  72 enum btrfs_rbio_ops {
  73         BTRFS_RBIO_WRITE        = 0,
  74         BTRFS_RBIO_READ_REBUILD = 1,
  75         BTRFS_RBIO_PARITY_SCRUB = 2,
  76 };
  77
  78 struct btrfs_raid_bio {
  79         struct btrfs_fs_info *fs_info;
  80         struct btrfs_bio *bbio;
  81
  82         /*
  83          * logical block numbers for the start of each stripe
  84          * The last one or two are p/q.  These are sorted,
  85          * so raid_map[0] is the start of our full stripe
  86          */
  87         u64 *raid_map;
  88
  89         /* while we're doing rmw on a stripe
  90          * we put it into a hash table so we can
  91          * lock the stripe and merge more rbios
  92          * into it.
  93          */
  94         struct list_head hash_list;
  95
  96         /*
  97          * LRU list for the stripe cache
  98          */
  99         struct list_head stripe_cache;
 100
 101         /*
 102          * for scheduling work in the helper threads
 103          */
 104         struct btrfs_work work;
 105
 106         /*
 107          * bio list and bio_list_lock are used
 108          * to add more bios into the stripe
 109          * in hopes of avoiding the full rmw
 110          */
 111         struct bio_list bio_list;
 112         spinlock_t bio_list_lock;
 113
 114         /* also protected by the bio_list_lock, the
 115          * plug list is used by the plugging code
 116          * to collect partial bios while plugged.  The
 117          * stripe locking code also uses it to hand off
 118          * the stripe lock to the next pending IO
 119          */
 120         struct list_head plug_list;
 121
 122         /*
 123          * flags that tell us if it is safe to
 124          * merge with this bio
 125          */
 126         unsigned long flags;
 127
 128         /* size of each individual stripe on disk */
 129         int stripe_len;
 130
 131         /* number of data stripes (no p/q) */
 132         int nr_data;
 133
 134         int real_stripes;
 135
 136         int stripe_npages;
 137         /*
 138          * set if we're doing a parity rebuild
 139          * for a read from higher up, which is handled
 140          * differently from a parity rebuild as part of
 141          * rmw
 142          */
 143         enum btrfs_rbio_ops operation;
 144
 145         /* first bad stripe */
 146         int faila;
 147
 148         /* second bad stripe (for raid6 use) */
 149         int failb;
 150
 151         int scrubp;
 152         /*
 153          * number of pages needed to represent the full
 154          * stripe
 155          */
 156         int nr_pages;
 157
 158         /*
 159          * size of all the bios in the bio_list.  This
 160          * helps us decide if the rbio maps to a full
 161          * stripe or not
 162          */
 163         int bio_list_bytes;
 164
 165         int generic_bio_cnt;
 166
 167         atomic_t refs;
 168
 169         atomic_t stripes_pending;
 170
 171         atomic_t error;
 172         /*
 173          * these are two arrays of pointers.  We allocate the
 174          * rbio big enough to hold them both and setup their
 175          * locations when the rbio is allocated
 176          */
 177
 178         /* pointers to pages that we allocated for
 179          * reading/writing stripes directly from the disk (including P/Q)
 180          */
 181         struct page **stripe_pages;
 182
 183         /*
 184          * pointers to the pages in the bio_list.  Stored
 185          * here for faster lookup
 186          */
 187         struct page **bio_pages;
 188
 189         /*
 190          * bitmap to record which horizontal stripe has data
 191          */
 192         unsigned long *dbitmap;
 193 };
 194
 195 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 196 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
 197 static void rmw_work(struct btrfs_work *work);
 198 static void read_rebuild_work(struct btrfs_work *work);
 199 static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
 200 static void async_read_rebuild(struct btrfs_raid_bio *rbio);
 201 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
 202 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
 203 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
 204 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
 205 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
 206
 207 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 208                                          int need_check);
 209 static void async_scrub_parity(struct btrfs_raid_bio *rbio);
 210
 211 /*
 212  * the stripe hash table is used for locking, and to collect
 213  * bios in hopes of making a full stripe
 214  */
 215 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 216 {
 217         struct btrfs_stripe_hash_table *table;
 218         struct btrfs_stripe_hash_table *x;
 219         struct btrfs_stripe_hash *cur;
 220         struct btrfs_stripe_hash *h;
 221         int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
 222         int i;
 223         int table_size;
 224
 225         if (info->stripe_hash_table)
 226                 return 0;
 227
 228         /*
 229          * The table is large, starting with order 4 and can go as high as
 230          * order 7 in case lock debugging is turned on.
 231          *
 232          * Try harder to allocate and fallback to vmalloc to lower the chance
 233          * of a failing mount.
 234          */
 235         table_size = sizeof(*table) + sizeof(*h) * num_entries;
 236         table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
 237         if (!table) {
 238                 table = vzalloc(table_size);
 239                 if (!table)
 240                         return -ENOMEM;
 241         }
 242
 243         spin_lock_init(&table->cache_lock);
 244         INIT_LIST_HEAD(&table->stripe_cache);
 245
 246         h = table->table;
 247
 248         for (i = 0; i < num_entries; i++) {
 249                 cur = h + i;
 250                 INIT_LIST_HEAD(&cur->hash_list);
 251                 spin_lock_init(&cur->lock);
 252                 init_waitqueue_head(&cur->wait);
 253         }
 254
 255         x = cmpxchg(&info->stripe_hash_table, NULL, table);
 256         if (x) {
 257                 if (is_vmalloc_addr(x))
 258                         vfree(x);
 259                 else
 260                         kfree(x);
 261         }
 262         return 0;
 263 }
 264
 265 /*
 266  * caching an rbio means to copy anything from the
 267  * bio_pages array into the stripe_pages array.  We
 268  * use the page uptodate bit in the stripe cache array
 269  * to indicate if it has valid data
 270  *
 271  * once the caching is done, we set the cache ready
 272  * bit.
 273  */
 274 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
 275 {
 276         int i;
 277         char *s;
 278         char *d;
 279         int ret;
 280
 281         ret = alloc_rbio_pages(rbio);
 282         if (ret)
 283                 return;
 284
 285         for (i = 0; i < rbio->nr_pages; i++) {
 286                 if (!rbio->bio_pages[i])
 287                         continue;
 288
 289                 s = kmap(rbio->bio_pages[i]);
 290                 d = kmap(rbio->stripe_pages[i]);
 291
 292                 memcpy(d, s, PAGE_CACHE_SIZE);
 293
 294                 kunmap(rbio->bio_pages[i]);
 295                 kunmap(rbio->stripe_pages[i]);
 296                 SetPageUptodate(rbio->stripe_pages[i]);
 297         }
 298         set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 299 }
 300
 301 /*
 302  * we hash on the first logical address of the stripe
 303  */
 304 static int rbio_bucket(struct btrfs_raid_bio *rbio)
 305 {
 306         u64 num = rbio->raid_map[0];
 307
 308         /*
 309          * we shift down quite a bit.  We're using byte
 310          * addressing, and most of the lower bits are zeros.
 311          * This tends to upset hash_64, and it consistently
 312          * returns just one or two different values.
 313          *
 314          * shifting off the lower bits fixes things.
 315          */
 316         return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
 317 }
 318
 319 /*
 320  * stealing an rbio means taking all the uptodate pages from the stripe
 321  * array in the source rbio and putting them into the destination rbio
 322  */
 323 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 324 {
 325         int i;
 326         struct page *s;
 327         struct page *d;
 328
 329         if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
 330                 return;
 331
 332         for (i = 0; i < dest->nr_pages; i++) {
 333                 s = src->stripe_pages[i];
 334                 if (!s || !PageUptodate(s)) {
 335                         continue;
 336                 }
 337
 338                 d = dest->stripe_pages[i];
 339                 if (d)
 340                         __free_page(d);
 341
 342                 dest->stripe_pages[i] = s;
 343                 src->stripe_pages[i] = NULL;
 344         }
 345 }
 346
 347 /*
 348  * merging means we take the bio_list from the victim and
 349  * splice it into the destination.  The victim should
 350  * be discarded afterwards.
 351  *
 352  * must be called with dest->rbio_list_lock held
 353  */
 354 static void merge_rbio(struct btrfs_raid_bio *dest,
 355                        struct btrfs_raid_bio *victim)
 356 {
 357         bio_list_merge(&dest->bio_list, &victim->bio_list);
 358         dest->bio_list_bytes += victim->bio_list_bytes;
 359         dest->generic_bio_cnt += victim->generic_bio_cnt;
 360         bio_list_init(&victim->bio_list);
 361 }
 362
 363 /*
 364  * used to prune items that are in the cache.  The caller
 365  * must hold the hash table lock.
 366  */
 367 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 368 {
 369         int bucket = rbio_bucket(rbio);
 370         struct btrfs_stripe_hash_table *table;
 371         struct btrfs_stripe_hash *h;
 372         int freeit = 0;
 373
 374         /*
 375          * check the bit again under the hash table lock.
 376          */
 377         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 378                 return;
 379
 380         table = rbio->fs_info->stripe_hash_table;
 381         h = table->table + bucket;
 382
 383         /* hold the lock for the bucket because we may be
 384          * removing it from the hash table
 385          */
 386         spin_lock(&h->lock);
 387
 388         /*
 389          * hold the lock for the bio list because we need
 390          * to make sure the bio list is empty
 391          */
 392         spin_lock(&rbio->bio_list_lock);
 393
 394         if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 395                 list_del_init(&rbio->stripe_cache);
 396                 table->cache_size -= 1;
 397                 freeit = 1;
 398
 399                 /* if the bio list isn't empty, this rbio is
 400                  * still involved in an IO.  We take it out
 401                  * of the cache list, and drop the ref that
 402                  * was held for the list.
 403                  *
 404                  * If the bio_list was empty, we also remove
 405                  * the rbio from the hash_table, and drop
 406                  * the corresponding ref
 407                  */
 408                 if (bio_list_empty(&rbio->bio_list)) {
 409                         if (!list_empty(&rbio->hash_list)) {
 410                                 list_del_init(&rbio->hash_list);
 411                                 atomic_dec(&rbio->refs);
 412                                 BUG_ON(!list_empty(&rbio->plug_list));
 413                         }
 414                 }
 415         }
 416
 417         spin_unlock(&rbio->bio_list_lock);
 418         spin_unlock(&h->lock);
 419
 420         if (freeit)
 421                 __free_raid_bio(rbio);
 422 }
 423
 424 /*
 425  * prune a given rbio from the cache
 426  */
 427 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 428 {
 429         struct btrfs_stripe_hash_table *table;
 430         unsigned long flags;
 431
 432         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 433                 return;
 434
 435         table = rbio->fs_info->stripe_hash_table;
 436
 437         spin_lock_irqsave(&table->cache_lock, flags);
 438         __remove_rbio_from_cache(rbio);
 439         spin_unlock_irqrestore(&table->cache_lock, flags);
 440 }
 441
 442 /*
 443  * remove everything in the cache
 444  */
 445 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
 446 {
 447         struct btrfs_stripe_hash_table *table;
 448         unsigned long flags;
 449         struct btrfs_raid_bio *rbio;
 450
 451         table = info->stripe_hash_table;
 452
 453         spin_lock_irqsave(&table->cache_lock, flags);
 454         while (!list_empty(&table->stripe_cache)) {
 455                 rbio = list_entry(table->stripe_cache.next,
 456                                   struct btrfs_raid_bio,
 457                                   stripe_cache);
 458                 __remove_rbio_from_cache(rbio);
 459         }
 460         spin_unlock_irqrestore(&table->cache_lock, flags);
 461 }
 462
 463 /*
 464  * remove all cached entries and free the hash table
 465  * used by unmount
 466  */
 467 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 468 {
 469         if (!info->stripe_hash_table)
 470                 return;
 471         btrfs_clear_rbio_cache(info);
 472         if (is_vmalloc_addr(info->stripe_hash_table))
 473                 vfree(info->stripe_hash_table);
 474         else
 475                 kfree(info->stripe_hash_table);
 476         info->stripe_hash_table = NULL;
 477 }
 478
 479 /*
 480  * insert an rbio into the stripe cache.  It
 481  * must have already been prepared by calling
 482  * cache_rbio_pages
 483  *
 484  * If this rbio was already cached, it gets
 485  * moved to the front of the lru.
 486  *
 487  * If the size of the rbio cache is too big, we
 488  * prune an item.
 489  */
 490 static void cache_rbio(struct btrfs_raid_bio *rbio)
 491 {
 492         struct btrfs_stripe_hash_table *table;
 493         unsigned long flags;
 494
 495         if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
 496                 return;
 497
 498         table = rbio->fs_info->stripe_hash_table;
 499
 500         spin_lock_irqsave(&table->cache_lock, flags);
 501         spin_lock(&rbio->bio_list_lock);
 502
 503         /* bump our ref if we were not in the list before */
 504         if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
 505                 atomic_inc(&rbio->refs);
 506
 507         if (!list_empty(&rbio->stripe_cache)){
 508                 list_move(&rbio->stripe_cache, &table->stripe_cache);
 509         } else {
 510                 list_add(&rbio->stripe_cache, &table->stripe_cache);
 511                 table->cache_size += 1;
 512         }
 513
 514         spin_unlock(&rbio->bio_list_lock);
 515
 516         if (table->cache_size > RBIO_CACHE_SIZE) {
 517                 struct btrfs_raid_bio *found;
 518
 519                 found = list_entry(table->stripe_cache.prev,
 520                                   struct btrfs_raid_bio,
 521                                   stripe_cache);
 522
 523                 if (found != rbio)
 524                         __remove_rbio_from_cache(found);
 525         }
 526
 527         spin_unlock_irqrestore(&table->cache_lock, flags);
 528         return;
 529 }
 530
 531 /*
 532  * helper function to run the xor_blocks api.  It is only
 533  * able to do MAX_XOR_BLOCKS at a time, so we need to
 534  * loop through.
 535  */
 536 static void run_xor(void **pages, int src_cnt, ssize_t len)
 537 {
 538         int src_off = 0;
 539         int xor_src_cnt = 0;
 540         void *dest = pages[src_cnt];
 541
 542         while(src_cnt > 0) {
 543                 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
 544                 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
 545
 546                 src_cnt -= xor_src_cnt;
 547                 src_off += xor_src_cnt;
 548         }
 549 }
 550
 551 /*
 552  * returns true if the bio list inside this rbio
 553  * covers an entire stripe (no rmw required).
 554  * Must be called with the bio list lock held, or
 555  * at a time when you know it is impossible to add
 556  * new bios into the list
 557  */
 558 static int __rbio_is_full(struct btrfs_raid_bio *rbio)
 559 {
 560         unsigned long size = rbio->bio_list_bytes;
 561         int ret = 1;
 562
 563         if (size != rbio->nr_data * rbio->stripe_len)
 564                 ret = 0;
 565
 566         BUG_ON(size > rbio->nr_data * rbio->stripe_len);
 567         return ret;
 568 }
 569
 570 static int rbio_is_full(struct btrfs_raid_bio *rbio)
 571 {
 572         unsigned long flags;
 573         int ret;
 574
 575         spin_lock_irqsave(&rbio->bio_list_lock, flags);
 576         ret = __rbio_is_full(rbio);
 577         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
 578         return ret;
 579 }
 580
 581 /*
 582  * returns 1 if it is safe to merge two rbios together.
 583  * The merging is safe if the two rbios correspond to
 584  * the same stripe and if they are both going in the same
 585  * direction (read vs write), and if neither one is
 586  * locked for final IO
 587  *
 588  * The caller is responsible for locking such that
 589  * rmw_locked is safe to test
 590  */
 591 static int rbio_can_merge(struct btrfs_raid_bio *last,
 592                           struct btrfs_raid_bio *cur)
 593 {
 594         if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
 595             test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 596                 return 0;
 597
 598         /*
 599          * we can't merge with cached rbios, since the
 600          * idea is that when we merge the destination
 601          * rbio is going to run our IO for us.  We can
 602          * steal from cached rbio's though, other functions
 603          * handle that.
 604          */
 605         if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
 606             test_bit(RBIO_CACHE_BIT, &cur->flags))
 607                 return 0;
 608
 609         if (last->raid_map[0] !=
 610             cur->raid_map[0])
 611                 return 0;
 612
 613         /* we can't merge with different operations */
 614         if (last->operation != cur->operation)
 615                 return 0;
 616         /*
 617          * We've need read the full stripe from the drive.
 618          * check and repair the parity and write the new results.
 619          *
 620          * We're not allowed to add any new bios to the
 621          * bio list here, anyone else that wants to
 622          * change this stripe needs to do their own rmw.
 623          */
 624         if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
 625             cur->operation == BTRFS_RBIO_PARITY_SCRUB)
 626                 return 0;
 627
 628         return 1;
 629 }
 630
 631 /*
 632  * helper to index into the pstripe
 633  */
 634 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
 635 {
 636         index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
 637         return rbio->stripe_pages[index];
 638 }
 639
 640 /*
 641  * helper to index into the qstripe, returns null
 642  * if there is no qstripe
 643  */
 644 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
 645 {
 646         if (rbio->nr_data + 1 == rbio->real_stripes)
 647                 return NULL;
 648
 649         index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
 650                 PAGE_CACHE_SHIFT;
 651         return rbio->stripe_pages[index];
 652 }
 653
 654 /*
 655  * The first stripe in the table for a logical address
 656  * has the lock.  rbios are added in one of three ways:
 657  *
 658  * 1) Nobody has the stripe locked yet.  The rbio is given
 659  * the lock and 0 is returned.  The caller must start the IO
 660  * themselves.
 661  *
 662  * 2) Someone has the stripe locked, but we're able to merge
 663  * with the lock owner.  The rbio is freed and the IO will
 664  * start automatically along with the existing rbio.  1 is returned.
 665  *
 666  * 3) Someone has the stripe locked, but we're not able to merge.
 667  * The rbio is added to the lock owner's plug list, or merged into
 668  * an rbio already on the plug list.  When the lock owner unlocks,
 669  * the next rbio on the list is run and the IO is started automatically.
 670  * 1 is returned
 671  *
 672  * If we return 0, the caller still owns the rbio and must continue with
 673  * IO submission.  If we return 1, the caller must assume the rbio has
 674  * already been freed.
 675  */
 676 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 677 {
 678         int bucket = rbio_bucket(rbio);
 679         struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
 680         struct btrfs_raid_bio *cur;
 681         struct btrfs_raid_bio *pending;
 682         unsigned long flags;
 683         DEFINE_WAIT(wait);
 684         struct btrfs_raid_bio *freeit = NULL;
 685         struct btrfs_raid_bio *cache_drop = NULL;
 686         int ret = 0;
 687         int walk = 0;
 688
 689         spin_lock_irqsave(&h->lock, flags);
 690         list_for_each_entry(cur, &h->hash_list, hash_list) {
 691                 walk++;
 692                 if (cur->raid_map[0] == rbio->raid_map[0]) {
 693                         spin_lock(&cur->bio_list_lock);
 694
 695                         /* can we steal this cached rbio's pages? */
 696                         if (bio_list_empty(&cur->bio_list) &&
 697                             list_empty(&cur->plug_list) &&
 698                             test_bit(RBIO_CACHE_BIT, &cur->flags) &&
 699                             !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
 700                                 list_del_init(&cur->hash_list);
 701                                 atomic_dec(&cur->refs);
 702
 703                                 steal_rbio(cur, rbio);
 704                                 cache_drop = cur;
 705                                 spin_unlock(&cur->bio_list_lock);
 706
 707                                 goto lockit;
 708                         }
 709
 710                         /* can we merge into the lock owner? */
 711                         if (rbio_can_merge(cur, rbio)) {
 712                                 merge_rbio(cur, rbio);
 713                                 spin_unlock(&cur->bio_list_lock);
 714                                 freeit = rbio;
 715                                 ret = 1;
 716                                 goto out;
 717                         }
 718
 719
 720                         /*
 721                          * we couldn't merge with the running
 722                          * rbio, see if we can merge with the
 723                          * pending ones.  We don't have to
 724                          * check for rmw_locked because there
 725                          * is no way they are inside finish_rmw
 726                          * right now
 727                          */
 728                         list_for_each_entry(pending, &cur->plug_list,
 729                                             plug_list) {
 730                                 if (rbio_can_merge(pending, rbio)) {
 731                                         merge_rbio(pending, rbio);
 732                                         spin_unlock(&cur->bio_list_lock);
 733                                         freeit = rbio;
 734                                         ret = 1;
 735                                         goto out;
 736                                 }
 737                         }
 738
 739                         /* no merging, put us on the tail of the plug list,
 740                          * our rbio will be started with the currently
 741                          * running rbio unlocks
 742                          */
 743                         list_add_tail(&rbio->plug_list, &cur->plug_list);
 744                         spin_unlock(&cur->bio_list_lock);
 745                         ret = 1;
 746                         goto out;
 747                 }
 748         }
 749 lockit:
 750         atomic_inc(&rbio->refs);
 751         list_add(&rbio->hash_list, &h->hash_list);
 752 out:
 753         spin_unlock_irqrestore(&h->lock, flags);
 754         if (cache_drop)
 755                 remove_rbio_from_cache(cache_drop);
 756         if (freeit)
 757                 __free_raid_bio(freeit);
 758         return ret;
 759 }
 760
 761 /*
 762  * called as rmw or parity rebuild is completed.  If the plug list has more
 763  * rbios waiting for this stripe, the next one on the list will be started
 764  */
 765 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 766 {
 767         int bucket;
 768         struct btrfs_stripe_hash *h;
 769         unsigned long flags;
 770         int keep_cache = 0;
 771
 772         bucket = rbio_bucket(rbio);
 773         h = rbio->fs_info->stripe_hash_table->table + bucket;
 774
 775         if (list_empty(&rbio->plug_list))
 776                 cache_rbio(rbio);
 777
 778         spin_lock_irqsave(&h->lock, flags);
 779         spin_lock(&rbio->bio_list_lock);
 780
 781         if (!list_empty(&rbio->hash_list)) {
 782                 /*
 783                  * if we're still cached and there is no other IO
 784                  * to perform, just leave this rbio here for others
 785                  * to steal from later
 786                  */
 787                 if (list_empty(&rbio->plug_list) &&
 788                     test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 789                         keep_cache = 1;
 790                         clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
 791                         BUG_ON(!bio_list_empty(&rbio->bio_list));
 792                         goto done;
 793                 }
 794
 795                 list_del_init(&rbio->hash_list);
 796                 atomic_dec(&rbio->refs);
 797
 798                 /*
 799                  * we use the plug list to hold all the rbios
 800                  * waiting for the chance to lock this stripe.
 801                  * hand the lock over to one of them.
 802                  */
 803                 if (!list_empty(&rbio->plug_list)) {
 804                         struct btrfs_raid_bio *next;
 805                         struct list_head *head = rbio->plug_list.next;
 806
 807                         next = list_entry(head, struct btrfs_raid_bio,
 808                                           plug_list);
 809
 810                         list_del_init(&rbio->plug_list);
 811
 812                         list_add(&next->hash_list, &h->hash_list);
 813                         atomic_inc(&next->refs);
 814                         spin_unlock(&rbio->bio_list_lock);
 815                         spin_unlock_irqrestore(&h->lock, flags);
 816
 817                         if (next->operation == BTRFS_RBIO_READ_REBUILD)
 818                                 async_read_rebuild(next);
 819                         else if (next->operation == BTRFS_RBIO_WRITE) {
 820                                 steal_rbio(rbio, next);
 821                                 async_rmw_stripe(next);
 822                         } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
 823                                 steal_rbio(rbio, next);
 824                                 async_scrub_parity(next);
 825                         }
 826
 827                         goto done_nolock;
 828                 } else  if (waitqueue_active(&h->wait)) {
 829                         spin_unlock(&rbio->bio_list_lock);
 830                         spin_unlock_irqrestore(&h->lock, flags);
 831                         wake_up(&h->wait);
 832                         goto done_nolock;
 833                 }
 834         }
 835 done:
 836         spin_unlock(&rbio->bio_list_lock);
 837         spin_unlock_irqrestore(&h->lock, flags);
 838
 839 done_nolock:
 840         if (!keep_cache)
 841                 remove_rbio_from_cache(rbio);
 842 }
 843
 844 static inline void
 845 __free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
 846 {
 847         if (need) {
 848                 kfree(raid_map);
 849                 kfree(bbio);
 850         }
 851 }
 852
 853 static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
 854 {
 855         __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
 856                         !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
 857 }
 858
 859 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 860 {
 861         int i;
 862
 863         WARN_ON(atomic_read(&rbio->refs) < 0);
 864         if (!atomic_dec_and_test(&rbio->refs))
 865                 return;
 866
 867         WARN_ON(!list_empty(&rbio->stripe_cache));
 868         WARN_ON(!list_empty(&rbio->hash_list));
 869         WARN_ON(!bio_list_empty(&rbio->bio_list));
 870
 871         for (i = 0; i < rbio->nr_pages; i++) {
 872                 if (rbio->stripe_pages[i]) {
 873                         __free_page(rbio->stripe_pages[i]);
 874                         rbio->stripe_pages[i] = NULL;
 875                 }
 876         }
 877
 878         free_bbio_and_raid_map(rbio);
 879
 880         kfree(rbio);
 881 }
 882
 883 static void free_raid_bio(struct btrfs_raid_bio *rbio)
 884 {
 885         unlock_stripe(rbio);
 886         __free_raid_bio(rbio);
 887 }
 888
 889 /*
 890  * this frees the rbio and runs through all the bios in the
 891  * bio_list and calls end_io on them
 892  */
 893 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
 894 {
 895         struct bio *cur = bio_list_get(&rbio->bio_list);
 896         struct bio *next;
 897
 898         if (rbio->generic_bio_cnt)
 899                 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
 900
 901         free_raid_bio(rbio);
 902
 903         while (cur) {
 904                 next = cur->bi_next;
 905                 cur->bi_next = NULL;
 906                 if (uptodate)
 907                         set_bit(BIO_UPTODATE, &cur->bi_flags);
 908                 bio_endio(cur, err);
 909                 cur = next;
 910         }
 911 }
 912
 913 /*
 914  * end io function used by finish_rmw.  When we finally
 915  * get here, we've written a full stripe
 916  */
 917 static void raid_write_end_io(struct bio *bio, int err)
 918 {
 919         struct btrfs_raid_bio *rbio = bio->bi_private;
 920
 921         if (err)
 922                 fail_bio_stripe(rbio, bio);
 923
 924         bio_put(bio);
 925
 926         if (!atomic_dec_and_test(&rbio->stripes_pending))
 927                 return;
 928
 929         err = 0;
 930
 931         /* OK, we have read all the stripes we need to. */
 932         if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
 933                 err = -EIO;
 934
 935         rbio_orig_end_io(rbio, err, 0);
 936         return;
 937 }
 938
 939 /*
 940  * the read/modify/write code wants to use the original bio for
 941  * any pages it included, and then use the rbio for everything
 942  * else.  This function decides if a given index (stripe number)
 943  * and page number in that stripe fall inside the original bio
 944  * or the rbio.
 945  *
 946  * if you set bio_list_only, you'll get a NULL back for any ranges
 947  * that are outside the bio_list
 948  *
 949  * This doesn't take any refs on anything, you get a bare page pointer
 950  * and the caller must bump refs as required.
 951  *
 952  * You must call index_rbio_pages once before you can trust
 953  * the answers from this function.
 954  */
 955 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
 956                                  int index, int pagenr, int bio_list_only)
 957 {
 958         int chunk_page;
 959         struct page *p = NULL;
 960
 961         chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
 962
 963         spin_lock_irq(&rbio->bio_list_lock);
 964         p = rbio->bio_pages[chunk_page];
 965         spin_unlock_irq(&rbio->bio_list_lock);
 966
 967         if (p || bio_list_only)
 968                 return p;
 969
 970         return rbio->stripe_pages[chunk_page];
 971 }
 972
 973 /*
 974  * number of pages we need for the entire stripe across all the
 975  * drives
 976  */
 977 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
 978 {
 979         unsigned long nr = stripe_len * nr_stripes;
 980         return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
 981 }
 982
 983 /*
 984  * allocation and initial setup for the btrfs_raid_bio.  Not
 985  * this does not allocate any pages for rbio->pages.
 986  */
 987 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 988                           struct btrfs_bio *bbio, u64 *raid_map,
 989                           u64 stripe_len)
 990 {
 991         struct btrfs_raid_bio *rbio;
 992         int nr_data = 0;
 993         int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
 994         int num_pages = rbio_nr_pages(stripe_len, real_stripes);
 995         int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
 996         void *p;
 997
 998         rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
 999                        DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
1000                         GFP_NOFS);
1001         if (!rbio)
1002                 return ERR_PTR(-ENOMEM);
1003
1004         bio_list_init(&rbio->bio_list);
1005         INIT_LIST_HEAD(&rbio->plug_list);
1006         spin_lock_init(&rbio->bio_list_lock);
1007         INIT_LIST_HEAD(&rbio->stripe_cache);
1008         INIT_LIST_HEAD(&rbio->hash_list);
1009         rbio->bbio = bbio;
1010         rbio->raid_map = raid_map;
1011         rbio->fs_info = root->fs_info;
1012         rbio->stripe_len = stripe_len;
1013         rbio->nr_pages = num_pages;
1014         rbio->real_stripes = real_stripes;
1015         rbio->stripe_npages = stripe_npages;
1016         rbio->faila = -1;
1017         rbio->failb = -1;
1018         atomic_set(&rbio->refs, 1);
1019         atomic_set(&rbio->error, 0);
1020         atomic_set(&rbio->stripes_pending, 0);
1021
1022         /*
1023          * the stripe_pages and bio_pages array point to the extra
1024          * memory we allocated past the end of the rbio
1025          */
1026         p = rbio + 1;
1027         rbio->stripe_pages = p;
1028         rbio->bio_pages = p + sizeof(struct page *) * num_pages;
1029         rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
1030
1031         if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
1032                 nr_data = real_stripes - 2;
1033         else
1034                 nr_data = real_stripes - 1;
1035
1036         rbio->nr_data = nr_data;
1037         return rbio;
1038 }
1039
1040 /* allocate pages for all the stripes in the bio, including parity */
1041 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1042 {
1043         int i;
1044         struct page *page;
1045
1046         for (i = 0; i < rbio->nr_pages; i++) {
1047                 if (rbio->stripe_pages[i])
1048                         continue;
1049                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1050                 if (!page)
1051                         return -ENOMEM;
1052                 rbio->stripe_pages[i] = page;
1053                 ClearPageUptodate(page);
1054         }
1055         return 0;
1056 }
1057
1058 /* allocate pages for just the p/q stripes */
1059 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1060 {
1061         int i;
1062         struct page *page;
1063
1064         i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
1065
1066         for (; i < rbio->nr_pages; i++) {
1067                 if (rbio->stripe_pages[i])
1068                         continue;
1069                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1070                 if (!page)
1071                         return -ENOMEM;
1072                 rbio->stripe_pages[i] = page;
1073         }
1074         return 0;
1075 }
1076
1077 /*
1078  * add a single page from a specific stripe into our list of bios for IO
1079  * this will try to merge into existing bios if possible, and returns
1080  * zero if all went well.
1081  */
1082 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1083                             struct bio_list *bio_list,
1084                             struct page *page,
1085                             int stripe_nr,
1086                             unsigned long page_index,
1087                             unsigned long bio_max_len)
1088 {
1089         struct bio *last = bio_list->tail;
1090         u64 last_end = 0;
1091         int ret;
1092         struct bio *bio;
1093         struct btrfs_bio_stripe *stripe;
1094         u64 disk_start;
1095
1096         stripe = &rbio->bbio->stripes[stripe_nr];
1097         disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1098
1099         /* if the device is missing, just fail this stripe */
1100         if (!stripe->dev->bdev)
1101                 return fail_rbio_index(rbio, stripe_nr);
1102
1103         /* see if we can add this page onto our existing bio */
1104         if (last) {
1105                 last_end = (u64)last->bi_iter.bi_sector << 9;
1106                 last_end += last->bi_iter.bi_size;
1107
1108                 /*
1109                  * we can't merge these if they are from different
1110                  * devices or if they are not contiguous
1111                  */
1112                 if (last_end == disk_start && stripe->dev->bdev &&
1113                     test_bit(BIO_UPTODATE, &last->bi_flags) &&
1114                     last->bi_bdev == stripe->dev->bdev) {
1115                         ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1116                         if (ret == PAGE_CACHE_SIZE)
1117                                 return 0;
1118                 }
1119         }
1120
1121         /* put a new bio on the list */
1122         bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
1123         if (!bio)
1124                 return -ENOMEM;
1125
1126         bio->bi_iter.bi_size = 0;
1127         bio->bi_bdev = stripe->dev->bdev;
1128         bio->bi_iter.bi_sector = disk_start >> 9;
1129         set_bit(BIO_UPTODATE, &bio->bi_flags);
1130
1131         bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1132         bio_list_add(bio_list, bio);
1133         return 0;
1134 }
1135
1136 /*
1137  * while we're doing the read/modify/write cycle, we could
1138  * have errors in reading pages off the disk.  This checks
1139  * for errors and if we're not able to read the page it'll
1140  * trigger parity reconstruction.  The rmw will be finished
1141  * after we've reconstructed the failed stripes
1142  */
1143 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1144 {
1145         if (rbio->faila >= 0 || rbio->failb >= 0) {
1146                 BUG_ON(rbio->faila == rbio->real_stripes - 1);
1147                 __raid56_parity_recover(rbio);
1148         } else {
1149                 finish_rmw(rbio);
1150         }
1151 }
1152
1153 /*
1154  * these are just the pages from the rbio array, not from anything
1155  * the FS sent down to us
1156  */
1157 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1158 {
1159         int index;
1160         index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1161         index += page;
1162         return rbio->stripe_pages[index];
1163 }
1164
1165 /*
1166  * helper function to walk our bio list and populate the bio_pages array with
1167  * the result.  This seems expensive, but it is faster than constantly
1168  * searching through the bio list as we setup the IO in finish_rmw or stripe
1169  * reconstruction.
1170  *
1171  * This must be called before you trust the answers from page_in_rbio
1172  */
1173 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1174 {
1175         struct bio *bio;
1176         u64 start;
1177         unsigned long stripe_offset;
1178         unsigned long page_index;
1179         struct page *p;
1180         int i;
1181
1182         spin_lock_irq(&rbio->bio_list_lock);
1183         bio_list_for_each(bio, &rbio->bio_list) {
1184                 start = (u64)bio->bi_iter.bi_sector << 9;
1185                 stripe_offset = start - rbio->raid_map[0];
1186                 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1187
1188                 for (i = 0; i < bio->bi_vcnt; i++) {
1189                         p = bio->bi_io_vec[i].bv_page;
1190                         rbio->bio_pages[page_index + i] = p;
1191                 }
1192         }
1193         spin_unlock_irq(&rbio->bio_list_lock);
1194 }
1195
1196 /*
1197  * this is called from one of two situations.  We either
1198  * have a full stripe from the higher layers, or we've read all
1199  * the missing bits off disk.
1200  *
1201  * This will calculate the parity and then send down any
1202  * changed blocks.
1203  */
1204 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1205 {
1206         struct btrfs_bio *bbio = rbio->bbio;
1207         void *pointers[rbio->real_stripes];
1208         int stripe_len = rbio->stripe_len;
1209         int nr_data = rbio->nr_data;
1210         int stripe;
1211         int pagenr;
1212         int p_stripe = -1;
1213         int q_stripe = -1;
1214         struct bio_list bio_list;
1215         struct bio *bio;
1216         int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1217         int ret;
1218
1219         bio_list_init(&bio_list);
1220
1221         if (rbio->real_stripes - rbio->nr_data == 1) {
1222                 p_stripe = rbio->real_stripes - 1;
1223         } else if (rbio->real_stripes - rbio->nr_data == 2) {
1224                 p_stripe = rbio->real_stripes - 2;
1225                 q_stripe = rbio->real_stripes - 1;
1226         } else {
1227                 BUG();
1228         }
1229
1230         /* at this point we either have a full stripe,
1231          * or we've read the full stripe from the drive.
1232          * recalculate the parity and write the new results.
1233          *
1234          * We're not allowed to add any new bios to the
1235          * bio list here, anyone else that wants to
1236          * change this stripe needs to do their own rmw.
1237          */
1238         spin_lock_irq(&rbio->bio_list_lock);
1239         set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1240         spin_unlock_irq(&rbio->bio_list_lock);
1241
1242         atomic_set(&rbio->error, 0);
1243
1244         /*
1245          * now that we've set rmw_locked, run through the
1246          * bio list one last time and map the page pointers
1247          *
1248          * We don't cache full rbios because we're assuming
1249          * the higher layers are unlikely to use this area of
1250          * the disk again soon.  If they do use it again,
1251          * hopefully they will send another full bio.
1252          */
1253         index_rbio_pages(rbio);
1254         if (!rbio_is_full(rbio))
1255                 cache_rbio_pages(rbio);
1256         else
1257                 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1258
1259         for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1260                 struct page *p;
1261                 /* first collect one page from each data stripe */
1262                 for (stripe = 0; stripe < nr_data; stripe++) {
1263                         p = page_in_rbio(rbio, stripe, pagenr, 0);
1264                         pointers[stripe] = kmap(p);
1265                 }
1266
1267                 /* then add the parity stripe */
1268                 p = rbio_pstripe_page(rbio, pagenr);
1269                 SetPageUptodate(p);
1270                 pointers[stripe++] = kmap(p);
1271
1272                 if (q_stripe != -1) {
1273
1274                         /*
1275                          * raid6, add the qstripe and call the
1276                          * library function to fill in our p/q
1277                          */
1278                         p = rbio_qstripe_page(rbio, pagenr);
1279                         SetPageUptodate(p);
1280                         pointers[stripe++] = kmap(p);
1281
1282                         raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1283                                                 pointers);
1284                 } else {
1285                         /* raid5 */
1286                         memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1287                         run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1288                 }
1289
1290
1291                 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
1292                         kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1293         }
1294
1295         /*
1296          * time to start writing.  Make bios for everything from the
1297          * higher layers (the bio_list in our rbio) and our p/q.  Ignore
1298          * everything else.
1299          */
1300         for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1301                 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1302                         struct page *page;
1303                         if (stripe < rbio->nr_data) {
1304                                 page = page_in_rbio(rbio, stripe, pagenr, 1);
1305                                 if (!page)
1306                                         continue;
1307                         } else {
1308                                page = rbio_stripe_page(rbio, stripe, pagenr);
1309                         }
1310
1311                         ret = rbio_add_io_page(rbio, &bio_list,
1312                                        page, stripe, pagenr, rbio->stripe_len);
1313                         if (ret)
1314                                 goto cleanup;
1315                 }
1316         }
1317
1318         if (likely(!bbio->num_tgtdevs))
1319                 goto write_data;
1320
1321         for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1322                 if (!bbio->tgtdev_map[stripe])
1323                         continue;
1324
1325                 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1326                         struct page *page;
1327                         if (stripe < rbio->nr_data) {
1328                                 page = page_in_rbio(rbio, stripe, pagenr, 1);
1329                                 if (!page)
1330                                         continue;
1331                         } else {
1332                                page = rbio_stripe_page(rbio, stripe, pagenr);
1333                         }
1334
1335                         ret = rbio_add_io_page(rbio, &bio_list, page,
1336                                                rbio->bbio->tgtdev_map[stripe],
1337                                                pagenr, rbio->stripe_len);
1338                         if (ret)
1339                                 goto cleanup;
1340                 }
1341         }
1342
1343 write_data:
1344         atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1345         BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1346
1347         while (1) {
1348                 bio = bio_list_pop(&bio_list);
1349                 if (!bio)
1350                         break;
1351
1352                 bio->bi_private = rbio;
1353                 bio->bi_end_io = raid_write_end_io;
1354                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1355                 submit_bio(WRITE, bio);
1356         }
1357         return;
1358
1359 cleanup:
1360         rbio_orig_end_io(rbio, -EIO, 0);
1361 }
1362
1363 /*
1364  * helper to find the stripe number for a given bio.  Used to figure out which
1365  * stripe has failed.  This expects the bio to correspond to a physical disk,
1366  * so it looks up based on physical sector numbers.
1367  */
1368 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1369                            struct bio *bio)
1370 {
1371         u64 physical = bio->bi_iter.bi_sector;
1372         u64 stripe_start;
1373         int i;
1374         struct btrfs_bio_stripe *stripe;
1375
1376         physical <<= 9;
1377
1378         for (i = 0; i < rbio->bbio->num_stripes; i++) {
1379                 stripe = &rbio->bbio->stripes[i];
1380                 stripe_start = stripe->physical;
1381                 if (physical >= stripe_start &&
1382                     physical < stripe_start + rbio->stripe_len &&
1383                     bio->bi_bdev == stripe->dev->bdev) {
1384                         return i;
1385                 }
1386         }
1387         return -1;
1388 }
1389
1390 /*
1391  * helper to find the stripe number for a given
1392  * bio (before mapping).  Used to figure out which stripe has
1393  * failed.  This looks up based on logical block numbers.
1394  */
1395 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1396                                    struct bio *bio)
1397 {
1398         u64 logical = bio->bi_iter.bi_sector;
1399         u64 stripe_start;
1400         int i;
1401
1402         logical <<= 9;
1403
1404         for (i = 0; i < rbio->nr_data; i++) {
1405                 stripe_start = rbio->raid_map[i];
1406                 if (logical >= stripe_start &&
1407                     logical < stripe_start + rbio->stripe_len) {
1408                         return i;
1409                 }
1410         }
1411         return -1;
1412 }
1413
1414 /*
1415  * returns -EIO if we had too many failures
1416  */
1417 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1418 {
1419         unsigned long flags;
1420         int ret = 0;
1421
1422         spin_lock_irqsave(&rbio->bio_list_lock, flags);
1423
1424         /* we already know this stripe is bad, move on */
1425         if (rbio->faila == failed || rbio->failb == failed)
1426                 goto out;
1427
1428         if (rbio->faila == -1) {
1429                 /* first failure on this rbio */
1430                 rbio->faila = failed;
1431                 atomic_inc(&rbio->error);
1432         } else if (rbio->failb == -1) {
1433                 /* second failure on this rbio */
1434                 rbio->failb = failed;
1435                 atomic_inc(&rbio->error);
1436         } else {
1437                 ret = -EIO;
1438         }
1439 out:
1440         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1441
1442         return ret;
1443 }
1444
1445 /*
1446  * helper to fail a stripe based on a physical disk
1447  * bio.
1448  */
1449 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1450                            struct bio *bio)
1451 {
1452         int failed = find_bio_stripe(rbio, bio);
1453
1454         if (failed < 0)
1455                 return -EIO;
1456
1457         return fail_rbio_index(rbio, failed);
1458 }
1459
1460 /*
1461  * this sets each page in the bio uptodate.  It should only be used on private
1462  * rbio pages, nothing that comes in from the higher layers
1463  */
1464 static void set_bio_pages_uptodate(struct bio *bio)
1465 {
1466         int i;
1467         struct page *p;
1468
1469         for (i = 0; i < bio->bi_vcnt; i++) {
1470                 p = bio->bi_io_vec[i].bv_page;
1471                 SetPageUptodate(p);
1472         }
1473 }
1474
1475 /*
1476  * end io for the read phase of the rmw cycle.  All the bios here are physical
1477  * stripe bios we've read from the disk so we can recalculate the parity of the
1478  * stripe.
1479  *
1480  * This will usually kick off finish_rmw once all the bios are read in, but it
1481  * may trigger parity reconstruction if we had any errors along the way
1482  */
1483 static void raid_rmw_end_io(struct bio *bio, int err)
1484 {
1485         struct btrfs_raid_bio *rbio = bio->bi_private;
1486
1487         if (err)
1488                 fail_bio_stripe(rbio, bio);
1489         else
1490                 set_bio_pages_uptodate(bio);
1491
1492         bio_put(bio);
1493
1494         if (!atomic_dec_and_test(&rbio->stripes_pending))
1495                 return;
1496
1497         err = 0;
1498         if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1499                 goto cleanup;
1500
1501         /*
1502          * this will normally call finish_rmw to start our write
1503          * but if there are any failed stripes we'll reconstruct
1504          * from parity first
1505          */
1506         validate_rbio_for_rmw(rbio);
1507         return;
1508
1509 cleanup:
1510
1511         rbio_orig_end_io(rbio, -EIO, 0);
1512 }
1513
1514 static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1515 {
1516         btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1517                         rmw_work, NULL, NULL);
1518
1519         btrfs_queue_work(rbio->fs_info->rmw_workers,
1520                          &rbio->work);
1521 }
1522
1523 static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1524 {
1525         btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1526                         read_rebuild_work, NULL, NULL);
1527
1528         btrfs_queue_work(rbio->fs_info->rmw_workers,
1529                          &rbio->work);
1530 }
1531
1532 /*
1533  * the stripe must be locked by the caller.  It will
1534  * unlock after all the writes are done
1535  */
1536 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1537 {
1538         int bios_to_read = 0;
1539         struct bio_list bio_list;
1540         int ret;
1541         int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1542         int pagenr;
1543         int stripe;
1544         struct bio *bio;
1545
1546         bio_list_init(&bio_list);
1547
1548         ret = alloc_rbio_pages(rbio);
1549         if (ret)
1550                 goto cleanup;
1551
1552         index_rbio_pages(rbio);
1553
1554         atomic_set(&rbio->error, 0);
1555         /*
1556          * build a list of bios to read all the missing parts of this
1557          * stripe
1558          */
1559         for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1560                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1561                         struct page *page;
1562                         /*
1563                          * we want to find all the pages missing from
1564                          * the rbio and read them from the disk.  If
1565                          * page_in_rbio finds a page in the bio list
1566                          * we don't need to read it off the stripe.
1567                          */
1568                         page = page_in_rbio(rbio, stripe, pagenr, 1);
1569                         if (page)
1570                                 continue;
1571
1572                         page = rbio_stripe_page(rbio, stripe, pagenr);
1573                         /*
1574                          * the bio cache may have handed us an uptodate
1575                          * page.  If so, be happy and use it
1576                          */
1577                         if (PageUptodate(page))
1578                                 continue;
1579
1580                         ret = rbio_add_io_page(rbio, &bio_list, page,
1581                                        stripe, pagenr, rbio->stripe_len);
1582                         if (ret)
1583                                 goto cleanup;
1584                 }
1585         }
1586
1587         bios_to_read = bio_list_size(&bio_list);
1588         if (!bios_to_read) {
1589                 /*
1590                  * this can happen if others have merged with
1591                  * us, it means there is nothing left to read.
1592                  * But if there are missing devices it may not be
1593                  * safe to do the full stripe write yet.
1594                  */
1595                 goto finish;
1596         }
1597
1598         /*
1599          * the bbio may be freed once we submit the last bio.  Make sure
1600          * not to touch it after that
1601          */
1602         atomic_set(&rbio->stripes_pending, bios_to_read);
1603         while (1) {
1604                 bio = bio_list_pop(&bio_list);
1605                 if (!bio)
1606                         break;
1607
1608                 bio->bi_private = rbio;
1609                 bio->bi_end_io = raid_rmw_end_io;
1610
1611                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1612                                     BTRFS_WQ_ENDIO_RAID56);
1613
1614                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1615                 submit_bio(READ, bio);
1616         }
1617         /* the actual write will happen once the reads are done */
1618         return 0;
1619
1620 cleanup:
1621         rbio_orig_end_io(rbio, -EIO, 0);
1622         return -EIO;
1623
1624 finish:
1625         validate_rbio_for_rmw(rbio);
1626         return 0;
1627 }
1628
/*
 * the upper layers passed in a full stripe, so only the parity pages
 * need to be allocated before everything is sent down through
 * finish_rmw.
 *
 * Returns the allocation error (after dropping our reference on the
 * rbio) if the parity pages could not be allocated, and 0 otherwise.
 * finish_rmw is only called when lock_stripe_add returns 0.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret = alloc_rbio_parity_pages(rbio);

	if (ret) {
		__free_raid_bio(rbio);
		return ret;
	}

	if (lock_stripe_add(rbio) == 0)
		finish_rmw(rbio);

	return 0;
}
1648
/*
 * partial stripe writes get handed over to the async helpers, in the
 * hope that a few more writes are merged into this rbio before the
 * new parity is calculated.
 *
 * The async rmw is only queued when lock_stripe_add returns 0.
 * Always returns 0.
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
	if (lock_stripe_add(rbio) == 0)
		async_rmw_stripe(rbio);

	return 0;
}
1663
/*
 * sometimes enough new bios come in while we were reading from the
 * drive to make up a full stripe.  So check here whether the rbio is
 * full: a full one goes down the full stripe write path, anything
 * else heads off into rmw land.
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	if (rbio_is_full(rbio))
		return full_stripe_write(rbio);

	/* no full stripe, this one needs a read/modify/write cycle */
	return partial_stripe_write(rbio);
}
1677
1678 /*
1679  * We use plugging call backs to collect full stripes.
1680  * Any time we get a partial stripe write while plugged
1681  * we collect it into a list.  When the unplug comes down,
1682  * we sort the list by logical block number and merge
1683  * everything we can into the same rbios
1684  */
1685 struct btrfs_plug_cb {
1686         struct blk_plug_cb cb;
1687         struct btrfs_fs_info *info;
1688         struct list_head rbio_list;
1689         struct btrfs_work work;
1690 };
1691
1692 /*
1693  * rbios on the plug list are sorted for easier merging.
1694  */
1695 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1696 {
1697         struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1698                                                  plug_list);
1699         struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1700                                                  plug_list);
1701         u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1702         u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1703
1704         if (a_sector < b_sector)
1705                 return -1;
1706         if (a_sector > b_sector)
1707                 return 1;
1708         return 0;
1709 }
1710
1711 static void run_plug(struct btrfs_plug_cb *plug)
1712 {
1713         struct btrfs_raid_bio *cur;
1714         struct btrfs_raid_bio *last = NULL;
1715
1716         /*
1717          * sort our plug list then try to merge
1718          * everything we can in hopes of creating full
1719          * stripes.
1720          */
1721         list_sort(NULL, &plug->rbio_list, plug_cmp);
1722         while (!list_empty(&plug->rbio_list)) {
1723                 cur = list_entry(plug->rbio_list.next,
1724                                  struct btrfs_raid_bio, plug_list);
1725                 list_del_init(&cur->plug_list);
1726
1727                 if (rbio_is_full(cur)) {
1728                         /* we have a full stripe, send it down */
1729                         full_stripe_write(cur);
1730                         continue;
1731                 }
1732                 if (last) {
1733                         if (rbio_can_merge(last, cur)) {
1734                                 merge_rbio(last, cur);
1735                                 __free_raid_bio(cur);
1736                                 continue;
1737
1738                         }
1739                         __raid56_parity_write(last);
1740                 }
1741                 last = cur;
1742         }
1743         if (last) {
1744                 __raid56_parity_write(last);
1745         }
1746         kfree(plug);
1747 }
1748
1749 /*
1750  * if the unplug comes from schedule, we have to push the
1751  * work off to a helper thread
1752  */
1753 static void unplug_work(struct btrfs_work *work)
1754 {
1755         struct btrfs_plug_cb *plug;
1756         plug = container_of(work, struct btrfs_plug_cb, work);
1757         run_plug(plug);
1758 }
1759
1760 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1761 {
1762         struct btrfs_plug_cb *plug;
1763         plug = container_of(cb, struct btrfs_plug_cb, cb);
1764
1765         if (from_schedule) {
1766                 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1767                                 unplug_work, NULL, NULL);
1768                 btrfs_queue_work(plug->info->rmw_workers,
1769                                  &plug->work);
1770                 return;
1771         }
1772         run_plug(plug);
1773 }
1774
1775 /*
1776  * our main entry point for writes from the rest of the FS.
1777  */
1778 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1779                         struct btrfs_bio *bbio, u64 *raid_map,
1780                         u64 stripe_len)
1781 {
1782         struct btrfs_raid_bio *rbio;
1783         struct btrfs_plug_cb *plug = NULL;
1784         struct blk_plug_cb *cb;
1785         int ret;
1786
1787         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1788         if (IS_ERR(rbio)) {
1789                 __free_bbio_and_raid_map(bbio, raid_map, 1);
1790                 return PTR_ERR(rbio);
1791         }
1792         bio_list_add(&rbio->bio_list, bio);
1793         rbio->bio_list_bytes = bio->bi_iter.bi_size;
1794         rbio->operation = BTRFS_RBIO_WRITE;
1795
1796         btrfs_bio_counter_inc_noblocked(root->fs_info);
1797         rbio->generic_bio_cnt = 1;
1798
1799         /*
1800          * don't plug on full rbios, just get them out the door
1801          * as quickly as we can
1802          */
1803         if (rbio_is_full(rbio)) {
1804                 ret = full_stripe_write(rbio);
1805                 if (ret)
1806                         btrfs_bio_counter_dec(root->fs_info);
1807                 return ret;
1808         }
1809
1810         cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1811                                sizeof(*plug));
1812         if (cb) {
1813                 plug = container_of(cb, struct btrfs_plug_cb, cb);
1814                 if (!plug->info) {
1815                         plug->info = root->fs_info;
1816                         INIT_LIST_HEAD(&plug->rbio_list);
1817                 }
1818                 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1819                 ret = 0;
1820         } else {
1821                 ret = __raid56_parity_write(rbio);
1822                 if (ret)
1823                         btrfs_bio_counter_dec(root->fs_info);
1824         }
1825         return ret;
1826 }
1827
1828 /*
1829  * all parity reconstruction happens here.  We've read in everything
1830  * we can find from the drives and this does the heavy lifting of
1831  * sorting the good from the bad.
1832  */
1833 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1834 {
1835         int pagenr, stripe;
1836         void **pointers;
1837         int faila = -1, failb = -1;
1838         int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1839         struct page *page;
1840         int err;
1841         int i;
1842
1843         pointers = kzalloc(rbio->real_stripes * sizeof(void *),
1844                            GFP_NOFS);
1845         if (!pointers) {
1846                 err = -ENOMEM;
1847                 goto cleanup_io;
1848         }
1849
1850         faila = rbio->faila;
1851         failb = rbio->failb;
1852
1853         if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1854                 spin_lock_irq(&rbio->bio_list_lock);
1855                 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1856                 spin_unlock_irq(&rbio->bio_list_lock);
1857         }
1858
1859         index_rbio_pages(rbio);
1860
1861         for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1862                 /*
1863                  * Now we just use bitmap to mark the horizontal stripes in
1864                  * which we have data when doing parity scrub.
1865                  */
1866                 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1867                     !test_bit(pagenr, rbio->dbitmap))
1868                         continue;
1869
1870                 /* setup our array of pointers with pages
1871                  * from each stripe
1872                  */
1873                 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1874                         /*
1875                          * if we're rebuilding a read, we have to use
1876                          * pages from the bio list
1877                          */
1878                         if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1879                             (stripe == faila || stripe == failb)) {
1880                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
1881                         } else {
1882                                 page = rbio_stripe_page(rbio, stripe, pagenr);
1883                         }
1884                         pointers[stripe] = kmap(page);
1885                 }
1886
1887                 /* all raid6 handling here */
1888                 if (rbio->raid_map[rbio->real_stripes - 1] ==
1889                     RAID6_Q_STRIPE) {
1890
1891                         /*
1892                          * single failure, rebuild from parity raid5
1893                          * style
1894                          */
1895                         if (failb < 0) {
1896                                 if (faila == rbio->nr_data) {
1897                                         /*
1898                                          * Just the P stripe has failed, without
1899                                          * a bad data or Q stripe.
1900                                          * TODO, we should redo the xor here.
1901                                          */
1902                                         err = -EIO;
1903                                         goto cleanup;
1904                                 }
1905                                 /*
1906                                  * a single failure in raid6 is rebuilt
1907                                  * in the pstripe code below
1908                                  */
1909                                 goto pstripe;
1910                         }
1911
1912                         /* make sure our ps and qs are in order */
1913                         if (faila > failb) {
1914                                 int tmp = failb;
1915                                 failb = faila;
1916                                 faila = tmp;
1917                         }
1918
1919                         /* if the q stripe is failed, do a pstripe reconstruction
1920                          * from the xors.
1921                          * If both the q stripe and the P stripe are failed, we're
1922                          * here due to a crc mismatch and we can't give them the
1923                          * data they want
1924                          */
1925                         if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1926                                 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1927                                         err = -EIO;
1928                                         goto cleanup;
1929                                 }
1930                                 /*
1931                                  * otherwise we have one bad data stripe and
1932                                  * a good P stripe.  raid5!
1933                                  */
1934                                 goto pstripe;
1935                         }
1936
1937                         if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1938                                 raid6_datap_recov(rbio->real_stripes,
1939                                                   PAGE_SIZE, faila, pointers);
1940                         } else {
1941                                 raid6_2data_recov(rbio->real_stripes,
1942                                                   PAGE_SIZE, faila, failb,
1943                                                   pointers);
1944                         }
1945                 } else {
1946                         void *p;
1947
1948                         /* rebuild from P stripe here (raid5 or raid6) */
1949                         BUG_ON(failb != -1);
1950 pstripe:
1951                         /* Copy parity block into failed block to start with */
1952                         memcpy(pointers[faila],
1953                                pointers[rbio->nr_data],
1954                                PAGE_CACHE_SIZE);
1955
1956                         /* rearrange the pointer array */
1957                         p = pointers[faila];
1958                         for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1959                                 pointers[stripe] = pointers[stripe + 1];
1960                         pointers[rbio->nr_data - 1] = p;
1961
1962                         /* xor in the rest */
1963                         run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1964                 }
1965                 /* if we're doing this rebuild as part of an rmw, go through
1966                  * and set all of our private rbio pages in the
1967                  * failed stripes as uptodate.  This way finish_rmw will
1968                  * know they can be trusted.  If this was a read reconstruction,
1969                  * other endio functions will fiddle the uptodate bits
1970                  */
1971                 if (rbio->operation == BTRFS_RBIO_WRITE) {
1972                         for (i = 0;  i < nr_pages; i++) {
1973                                 if (faila != -1) {
1974                                         page = rbio_stripe_page(rbio, faila, i);
1975                                         SetPageUptodate(page);
1976                                 }
1977                                 if (failb != -1) {
1978                                         page = rbio_stripe_page(rbio, failb, i);
1979                                         SetPageUptodate(page);
1980                                 }
1981                         }
1982                 }
1983                 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1984                         /*
1985                          * if we're rebuilding a read, we have to use
1986                          * pages from the bio list
1987                          */
1988                         if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1989                             (stripe == faila || stripe == failb)) {
1990                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
1991                         } else {
1992                                 page = rbio_stripe_page(rbio, stripe, pagenr);
1993                         }
1994                         kunmap(page);
1995                 }
1996         }
1997
1998         err = 0;
1999 cleanup:
2000         kfree(pointers);
2001
2002 cleanup_io:
2003         if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
2004                 if (err == 0 &&
2005                     !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
2006                         cache_rbio_pages(rbio);
2007                 else
2008                         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2009
2010                 rbio_orig_end_io(rbio, err, err == 0);
2011         } else if (err == 0) {
2012                 rbio->faila = -1;
2013                 rbio->failb = -1;
2014
2015                 if (rbio->operation == BTRFS_RBIO_WRITE)
2016                         finish_rmw(rbio);
2017                 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2018                         finish_parity_scrub(rbio, 0);
2019                 else
2020                         BUG();
2021         } else {
2022                 rbio_orig_end_io(rbio, err, 0);
2023         }
2024 }
2025
2026 /*
2027  * This is called only for stripes we've read from disk to
2028  * reconstruct the parity.
2029  */
2030 static void raid_recover_end_io(struct bio *bio, int err)
2031 {
2032         struct btrfs_raid_bio *rbio = bio->bi_private;
2033
2034         /*
2035          * we only read stripe pages off the disk, set them
2036          * up to date if there were no errors
2037          */
2038         if (err)
2039                 fail_bio_stripe(rbio, bio);
2040         else
2041                 set_bio_pages_uptodate(bio);
2042         bio_put(bio);
2043
2044         if (!atomic_dec_and_test(&rbio->stripes_pending))
2045                 return;
2046
2047         if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2048                 rbio_orig_end_io(rbio, -EIO, 0);
2049         else
2050                 __raid_recover_end_io(rbio);
2051 }
2052
2053 /*
2054  * reads everything we need off the disk to reconstruct
2055  * the parity. endio handlers trigger final reconstruction
2056  * when the IO is done.
2057  *
2058  * This is used both for reads from the higher layers and for
2059  * parity construction required to finish a rmw cycle.
2060  */
2061 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2062 {
2063         int bios_to_read = 0;
2064         struct bio_list bio_list;
2065         int ret;
2066         int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
2067         int pagenr;
2068         int stripe;
2069         struct bio *bio;
2070
2071         bio_list_init(&bio_list);
2072
2073         ret = alloc_rbio_pages(rbio);
2074         if (ret)
2075                 goto cleanup;
2076
2077         atomic_set(&rbio->error, 0);
2078
2079         /*
2080          * read everything that hasn't failed.  Thanks to the
2081          * stripe cache, it is possible that some or all of these
2082          * pages are going to be uptodate.
2083          */
2084         for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2085                 if (rbio->faila == stripe || rbio->failb == stripe) {
2086                         atomic_inc(&rbio->error);
2087                         continue;
2088                 }
2089
2090                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
2091                         struct page *p;
2092
2093                         /*
2094                          * the rmw code may have already read this
2095                          * page in
2096                          */
2097                         p = rbio_stripe_page(rbio, stripe, pagenr);
2098                         if (PageUptodate(p))
2099                                 continue;
2100
2101                         ret = rbio_add_io_page(rbio, &bio_list,
2102                                        rbio_stripe_page(rbio, stripe, pagenr),
2103                                        stripe, pagenr, rbio->stripe_len);
2104                         if (ret < 0)
2105                                 goto cleanup;
2106                 }
2107         }
2108
2109         bios_to_read = bio_list_size(&bio_list);
2110         if (!bios_to_read) {
2111                 /*
2112                  * we might have no bios to read just because the pages
2113                  * were up to date, or we might have no bios to read because
2114                  * the devices were gone.
2115                  */
2116                 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
2117                         __raid_recover_end_io(rbio);
2118                         goto out;
2119                 } else {
2120                         goto cleanup;
2121                 }
2122         }
2123
2124         /*
2125          * the bbio may be freed once we submit the last bio.  Make sure
2126          * not to touch it after that
2127          */
2128         atomic_set(&rbio->stripes_pending, bios_to_read);
2129         while (1) {
2130                 bio = bio_list_pop(&bio_list);
2131                 if (!bio)
2132                         break;
2133
2134                 bio->bi_private = rbio;
2135                 bio->bi_end_io = raid_recover_end_io;
2136
2137                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2138                                     BTRFS_WQ_ENDIO_RAID56);
2139
2140                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2141                 submit_bio(READ, bio);
2142         }
2143 out:
2144         return 0;
2145
2146 cleanup:
2147         if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
2148                 rbio_orig_end_io(rbio, -EIO, 0);
2149         return -EIO;
2150 }
2151
2152 /*
2153  * the main entry point for reads from the higher layers.  This
2154  * is really only called when the normal read path had a failure,
2155  * so we assume the bio they send down corresponds to a failed part
2156  * of the drive.
2157  */
2158 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2159                           struct btrfs_bio *bbio, u64 *raid_map,
2160                           u64 stripe_len, int mirror_num, int generic_io)
2161 {
2162         struct btrfs_raid_bio *rbio;
2163         int ret;
2164
2165         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2166         if (IS_ERR(rbio)) {
2167                 __free_bbio_and_raid_map(bbio, raid_map, generic_io);
2168                 return PTR_ERR(rbio);
2169         }
2170
2171         rbio->operation = BTRFS_RBIO_READ_REBUILD;
2172         bio_list_add(&rbio->bio_list, bio);
2173         rbio->bio_list_bytes = bio->bi_iter.bi_size;
2174
2175         rbio->faila = find_logical_bio_stripe(rbio, bio);
2176         if (rbio->faila == -1) {
2177                 BUG();
2178                 __free_bbio_and_raid_map(bbio, raid_map, generic_io);
2179                 kfree(rbio);
2180                 return -EIO;
2181         }
2182
2183         if (generic_io) {
2184                 btrfs_bio_counter_inc_noblocked(root->fs_info);
2185                 rbio->generic_bio_cnt = 1;
2186         } else {
2187                 set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
2188         }
2189
2190         /*
2191          * reconstruct from the q stripe if they are
2192          * asking for mirror 3
2193          */
2194         if (mirror_num == 3)
2195                 rbio->failb = rbio->real_stripes - 2;
2196
2197         ret = lock_stripe_add(rbio);
2198
2199         /*
2200          * __raid56_parity_recover will end the bio with
2201          * any errors it hits.  We don't want to return
2202          * its error value up the stack because our caller
2203          * will end up calling bio_endio with any nonzero
2204          * return
2205          */
2206         if (ret == 0)
2207                 __raid56_parity_recover(rbio);
2208         /*
2209          * our rbio has been added to the list of
2210          * rbios that will be handled after the
2211          * currently lock owner is done
2212          */
2213         return 0;
2214
2215 }
2216
2217 static void rmw_work(struct btrfs_work *work)
2218 {
2219         struct btrfs_raid_bio *rbio;
2220
2221         rbio = container_of(work, struct btrfs_raid_bio, work);
2222         raid56_rmw_stripe(rbio);
2223 }
2224
2225 static void read_rebuild_work(struct btrfs_work *work)
2226 {
2227         struct btrfs_raid_bio *rbio;
2228
2229         rbio = container_of(work, struct btrfs_raid_bio, work);
2230         __raid56_parity_recover(rbio);
2231 }
2232
/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Note: we need to make sure that all the pages added to the scrub/replace
 * raid bio are correct and do not change during the scrub/replace, i.e.
 * those pages hold only metadata or file data with checksums.
 */
2240
2241 struct btrfs_raid_bio *
2242 raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
2243                                struct btrfs_bio *bbio, u64 *raid_map,
2244                                u64 stripe_len, struct btrfs_device *scrub_dev,
2245                                unsigned long *dbitmap, int stripe_nsectors)
2246 {
2247         struct btrfs_raid_bio *rbio;
2248         int i;
2249
2250         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2251         if (IS_ERR(rbio))
2252                 return NULL;
2253         bio_list_add(&rbio->bio_list, bio);
2254         /*
2255          * This is a special bio which is used to hold the completion handler
2256          * and make the scrub rbio is similar to the other types
2257          */
2258         ASSERT(!bio->bi_iter.bi_size);
2259         rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2260
2261         for (i = 0; i < rbio->real_stripes; i++) {
2262                 if (bbio->stripes[i].dev == scrub_dev) {
2263                         rbio->scrubp = i;
2264                         break;
2265                 }
2266         }
2267
2268         /* Now we just support the sectorsize equals to page size */
2269         ASSERT(root->sectorsize == PAGE_SIZE);
2270         ASSERT(rbio->stripe_npages == stripe_nsectors);
2271         bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2272
2273         return rbio;
2274 }
2275
2276 void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
2277                                    struct page *page, u64 logical)
2278 {
2279         int stripe_offset;
2280         int index;
2281
2282         ASSERT(logical >= rbio->raid_map[0]);
2283         ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
2284                                 rbio->stripe_len * rbio->nr_data);
2285         stripe_offset = (int)(logical - rbio->raid_map[0]);
2286         index = stripe_offset >> PAGE_CACHE_SHIFT;
2287         rbio->bio_pages[index] = page;
2288 }
2289
2290 /*
2291  * We just scrub the parity that we have correct data on the same horizontal,
2292  * so we needn't allocate all pages for all the stripes.
2293  */
2294 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2295 {
2296         int i;
2297         int bit;
2298         int index;
2299         struct page *page;
2300
2301         for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2302                 for (i = 0; i < rbio->real_stripes; i++) {
2303                         index = i * rbio->stripe_npages + bit;
2304                         if (rbio->stripe_pages[index])
2305                                 continue;
2306
2307                         page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2308                         if (!page)
2309                                 return -ENOMEM;
2310                         rbio->stripe_pages[index] = page;
2311                         ClearPageUptodate(page);
2312                 }
2313         }
2314         return 0;
2315 }
2316
2317 /*
2318  * end io function used by finish_rmw.  When we finally
2319  * get here, we've written a full stripe
2320  */
2321 static void raid_write_parity_end_io(struct bio *bio, int err)
2322 {
2323         struct btrfs_raid_bio *rbio = bio->bi_private;
2324
2325         if (err)
2326                 fail_bio_stripe(rbio, bio);
2327
2328         bio_put(bio);
2329
2330         if (!atomic_dec_and_test(&rbio->stripes_pending))
2331                 return;
2332
2333         err = 0;
2334
2335         if (atomic_read(&rbio->error))
2336                 err = -EIO;
2337
2338         rbio_orig_end_io(rbio, err, 0);
2339 }
2340
2341 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2342                                          int need_check)
2343 {
2344         struct btrfs_bio *bbio = rbio->bbio;
2345         void *pointers[rbio->real_stripes];
2346         DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
2347         int nr_data = rbio->nr_data;
2348         int stripe;
2349         int pagenr;
2350         int p_stripe = -1;
2351         int q_stripe = -1;
2352         struct page *p_page = NULL;
2353         struct page *q_page = NULL;
2354         struct bio_list bio_list;
2355         struct bio *bio;
2356         int is_replace = 0;
2357         int ret;
2358
2359         bio_list_init(&bio_list);
2360
2361         if (rbio->real_stripes - rbio->nr_data == 1) {
2362                 p_stripe = rbio->real_stripes - 1;
2363         } else if (rbio->real_stripes - rbio->nr_data == 2) {
2364                 p_stripe = rbio->real_stripes - 2;
2365                 q_stripe = rbio->real_stripes - 1;
2366         } else {
2367                 BUG();
2368         }
2369
2370         if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2371                 is_replace = 1;
2372                 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2373         }
2374
2375         /*
2376          * Because the higher layers(scrubber) are unlikely to
2377          * use this area of the disk again soon, so don't cache
2378          * it.
2379          */
2380         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2381
2382         if (!need_check)
2383                 goto writeback;
2384
2385         p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2386         if (!p_page)
2387                 goto cleanup;
2388         SetPageUptodate(p_page);
2389
2390         if (q_stripe != -1) {
2391                 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2392                 if (!q_page) {
2393                         __free_page(p_page);
2394                         goto cleanup;
2395                 }
2396                 SetPageUptodate(q_page);
2397         }
2398
2399         atomic_set(&rbio->error, 0);
2400
2401         for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2402                 struct page *p;
2403                 void *parity;
2404                 /* first collect one page from each data stripe */
2405                 for (stripe = 0; stripe < nr_data; stripe++) {
2406                         p = page_in_rbio(rbio, stripe, pagenr, 0);
2407                         pointers[stripe] = kmap(p);
2408                 }
2409
2410                 /* then add the parity stripe */
2411                 pointers[stripe++] = kmap(p_page);
2412
2413                 if (q_stripe != -1) {
2414
2415                         /*
2416                          * raid6, add the qstripe and call the
2417                          * library function to fill in our p/q
2418                          */
2419                         pointers[stripe++] = kmap(q_page);
2420
2421                         raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2422                                                 pointers);
2423                 } else {
2424                         /* raid5 */
2425                         memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
2426                         run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
2427                 }
2428
2429                 /* Check scrubbing pairty and repair it */
2430                 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2431                 parity = kmap(p);
2432                 if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
2433                         memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
2434                 else
2435                         /* Parity is right, needn't writeback */
2436                         bitmap_clear(rbio->dbitmap, pagenr, 1);
2437                 kunmap(p);
2438
2439                 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
2440                         kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
2441         }
2442
2443         __free_page(p_page);
2444         if (q_page)
2445                 __free_page(q_page);
2446
2447 writeback:
2448         /*
2449          * time to start writing.  Make bios for everything from the
2450          * higher layers (the bio_list in our rbio) and our p/q.  Ignore
2451          * everything else.
2452          */
2453         for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2454                 struct page *page;
2455
2456                 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2457                 ret = rbio_add_io_page(rbio, &bio_list,
2458                                page, rbio->scrubp, pagenr, rbio->stripe_len);
2459                 if (ret)
2460                         goto cleanup;
2461         }
2462
2463         if (!is_replace)
2464                 goto submit_write;
2465
2466         for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2467                 struct page *page;
2468
2469                 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2470                 ret = rbio_add_io_page(rbio, &bio_list, page,
2471                                        bbio->tgtdev_map[rbio->scrubp],
2472                                        pagenr, rbio->stripe_len);
2473                 if (ret)
2474                         goto cleanup;
2475         }
2476
2477 submit_write:
2478         nr_data = bio_list_size(&bio_list);
2479         if (!nr_data) {
2480                 /* Every parity is right */
2481                 rbio_orig_end_io(rbio, 0, 0);
2482                 return;
2483         }
2484
2485         atomic_set(&rbio->stripes_pending, nr_data);
2486
2487         while (1) {
2488                 bio = bio_list_pop(&bio_list);
2489                 if (!bio)
2490                         break;
2491
2492                 bio->bi_private = rbio;
2493                 bio->bi_end_io = raid_write_parity_end_io;
2494                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2495                 submit_bio(WRITE, bio);
2496         }
2497         return;
2498
2499 cleanup:
2500         rbio_orig_end_io(rbio, -EIO, 0);
2501 }
2502
2503 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2504 {
2505         if (stripe >= 0 && stripe < rbio->nr_data)
2506                 return 1;
2507         return 0;
2508 }
2509
2510 /*
2511  * While we're doing the parity check and repair, we could have errors
2512  * in reading pages off the disk.  This checks for errors and if we're
2513  * not able to read the page it'll trigger parity reconstruction.  The
2514  * parity scrub will be finished after we've reconstructed the failed
2515  * stripes
2516  */
2517 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2518 {
2519         if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2520                 goto cleanup;
2521
2522         if (rbio->faila >= 0 || rbio->failb >= 0) {
2523                 int dfail = 0, failp = -1;
2524
2525                 if (is_data_stripe(rbio, rbio->faila))
2526                         dfail++;
2527                 else if (is_parity_stripe(rbio->faila))
2528                         failp = rbio->faila;
2529
2530                 if (is_data_stripe(rbio, rbio->failb))
2531                         dfail++;
2532                 else if (is_parity_stripe(rbio->failb))
2533                         failp = rbio->failb;
2534
2535                 /*
2536                  * Because we can not use a scrubbing parity to repair
2537                  * the data, so the capability of the repair is declined.
2538                  * (In the case of RAID5, we can not repair anything)
2539                  */
2540                 if (dfail > rbio->bbio->max_errors - 1)
2541                         goto cleanup;
2542
2543                 /*
2544                  * If all data is good, only parity is correctly, just
2545                  * repair the parity.
2546                  */
2547                 if (dfail == 0) {
2548                         finish_parity_scrub(rbio, 0);
2549                         return;
2550                 }
2551
2552                 /*
2553                  * Here means we got one corrupted data stripe and one
2554                  * corrupted parity on RAID6, if the corrupted parity
2555                  * is scrubbing parity, luckly, use the other one to repair
2556                  * the data, or we can not repair the data stripe.
2557                  */
2558                 if (failp != rbio->scrubp)
2559                         goto cleanup;
2560
2561                 __raid_recover_end_io(rbio);
2562         } else {
2563                 finish_parity_scrub(rbio, 1);
2564         }
2565         return;
2566
2567 cleanup:
2568         rbio_orig_end_io(rbio, -EIO, 0);
2569 }
2570
2571 /*
2572  * end io for the read phase of the rmw cycle.  All the bios here are physical
2573  * stripe bios we've read from the disk so we can recalculate the parity of the
2574  * stripe.
2575  *
2576  * This will usually kick off finish_rmw once all the bios are read in, but it
2577  * may trigger parity reconstruction if we had any errors along the way
2578  */
2579 static void raid56_parity_scrub_end_io(struct bio *bio, int err)
2580 {
2581         struct btrfs_raid_bio *rbio = bio->bi_private;
2582
2583         if (err)
2584                 fail_bio_stripe(rbio, bio);
2585         else
2586                 set_bio_pages_uptodate(bio);
2587
2588         bio_put(bio);
2589
2590         if (!atomic_dec_and_test(&rbio->stripes_pending))
2591                 return;
2592
2593         /*
2594          * this will normally call finish_rmw to start our write
2595          * but if there are any failed stripes we'll reconstruct
2596          * from parity first
2597          */
2598         validate_rbio_for_parity_scrub(rbio);
2599 }
2600
2601 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2602 {
2603         int bios_to_read = 0;
2604         struct bio_list bio_list;
2605         int ret;
2606         int pagenr;
2607         int stripe;
2608         struct bio *bio;
2609
2610         ret = alloc_rbio_essential_pages(rbio);
2611         if (ret)
2612                 goto cleanup;
2613
2614         bio_list_init(&bio_list);
2615
2616         atomic_set(&rbio->error, 0);
2617         /*
2618          * build a list of bios to read all the missing parts of this
2619          * stripe
2620          */
2621         for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2622                 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2623                         struct page *page;
2624                         /*
2625                          * we want to find all the pages missing from
2626                          * the rbio and read them from the disk.  If
2627                          * page_in_rbio finds a page in the bio list
2628                          * we don't need to read it off the stripe.
2629                          */
2630                         page = page_in_rbio(rbio, stripe, pagenr, 1);
2631                         if (page)
2632                                 continue;
2633
2634                         page = rbio_stripe_page(rbio, stripe, pagenr);
2635                         /*
2636                          * the bio cache may have handed us an uptodate
2637                          * page.  If so, be happy and use it
2638                          */
2639                         if (PageUptodate(page))
2640                                 continue;
2641
2642                         ret = rbio_add_io_page(rbio, &bio_list, page,
2643                                        stripe, pagenr, rbio->stripe_len);
2644                         if (ret)
2645                                 goto cleanup;
2646                 }
2647         }
2648
2649         bios_to_read = bio_list_size(&bio_list);
2650         if (!bios_to_read) {
2651                 /*
2652                  * this can happen if others have merged with
2653                  * us, it means there is nothing left to read.
2654                  * But if there are missing devices it may not be
2655                  * safe to do the full stripe write yet.
2656                  */
2657                 goto finish;
2658         }
2659
2660         /*
2661          * the bbio may be freed once we submit the last bio.  Make sure
2662          * not to touch it after that
2663          */
2664         atomic_set(&rbio->stripes_pending, bios_to_read);
2665         while (1) {
2666                 bio = bio_list_pop(&bio_list);
2667                 if (!bio)
2668                         break;
2669
2670                 bio->bi_private = rbio;
2671                 bio->bi_end_io = raid56_parity_scrub_end_io;
2672
2673                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2674                                     BTRFS_WQ_ENDIO_RAID56);
2675
2676                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2677                 submit_bio(READ, bio);
2678         }
2679         /* the actual write will happen once the reads are done */
2680         return;
2681
2682 cleanup:
2683         rbio_orig_end_io(rbio, -EIO, 0);
2684         return;
2685
2686 finish:
2687         validate_rbio_for_parity_scrub(rbio);
2688 }
2689
2690 static void scrub_parity_work(struct btrfs_work *work)
2691 {
2692         struct btrfs_raid_bio *rbio;
2693
2694         rbio = container_of(work, struct btrfs_raid_bio, work);
2695         raid56_parity_scrub_stripe(rbio);
2696 }
2697
2698 static void async_scrub_parity(struct btrfs_raid_bio *rbio)
2699 {
2700         btrfs_init_work(&rbio->work, btrfs_rmw_helper,
2701                         scrub_parity_work, NULL, NULL);
2702
2703         btrfs_queue_work(rbio->fs_info->rmw_workers,
2704                          &rbio->work);
2705 }
2706
/*
 * Submit a parity scrub rbio.
 *
 * The scrub work is queued only when lock_stripe_add() returns zero.
 * On a non-zero return nothing more is done here.
 * NOTE(review): presumably the rbio is then started later by whoever
 * holds the stripe lock -- confirm against lock_stripe_add().
 */
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
        if (lock_stripe_add(rbio))
                return;

        async_scrub_parity(rbio);
}