fs/btrfs/raid56.c

   1 /*
   2  * Copyright (C) 2012 Fusion-io  All rights reserved.
   3  * Copyright (C) 2012 Intel Corp. All rights reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public
   7  * License v2 as published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public
  15  * License along with this program; if not, write to the
  16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  17  * Boston, MA 021110-1307, USA.
  18  */
  19 #include <linux/sched.h>
  20 #include <linux/wait.h>
  21 #include <linux/bio.h>
  22 #include <linux/slab.h>
  23 #include <linux/buffer_head.h>
  24 #include <linux/blkdev.h>
  25 #include <linux/random.h>
  26 #include <linux/iocontext.h>
  27 #include <linux/capability.h>
  28 #include <linux/ratelimit.h>
  29 #include <linux/kthread.h>
  30 #include <linux/raid/pq.h>
  31 #include <linux/hash.h>
  32 #include <linux/list_sort.h>
  33 #include <linux/raid/xor.h>
  34 #include <linux/vmalloc.h>
  35 #include <asm/div64.h>
  36 #include "compat.h"
  37 #include "ctree.h"
  38 #include "extent_map.h"
  39 #include "disk-io.h"
  40 #include "transaction.h"
  41 #include "print-tree.h"
  42 #include "volumes.h"
  43 #include "raid56.h"
  44 #include "async-thread.h"
  45 #include "check-integrity.h"
  46 #include "rcu-string.h"
  47
  48 /* set when additional merges to this rbio are not allowed */
  49 #define RBIO_RMW_LOCKED_BIT     1
  50
  51 /*
  52  * set when this rbio is sitting in the hash, but it is just a cache
  53  * of past RMW
  54  */
  55 #define RBIO_CACHE_BIT          2
  56
  57 /*
  58  * set when it is safe to trust the stripe_pages for caching
  59  */
  60 #define RBIO_CACHE_READY_BIT    3
  61
  62
  63 #define RBIO_CACHE_SIZE 1024
  64
  65 struct btrfs_raid_bio {
  66         struct btrfs_fs_info *fs_info;
  67         struct btrfs_bio *bbio;
  68
  69         /*
  70          * logical block numbers for the start of each stripe
  71          * The last one or two are p/q.  These are sorted,
  72          * so raid_map[0] is the start of our full stripe
  73          */
  74         u64 *raid_map;
  75
  76         /* while we're doing rmw on a stripe
  77          * we put it into a hash table so we can
  78          * lock the stripe and merge more rbios
  79          * into it.
  80          */
  81         struct list_head hash_list;
  82
  83         /*
  84          * LRU list for the stripe cache
  85          */
  86         struct list_head stripe_cache;
  87
  88         /*
  89          * for scheduling work in the helper threads
  90          */
  91         struct btrfs_work work;
  92
  93         /*
  94          * bio list and bio_list_lock are used
  95          * to add more bios into the stripe
  96          * in hopes of avoiding the full rmw
  97          */
  98         struct bio_list bio_list;
  99         spinlock_t bio_list_lock;
 100
 101         /* also protected by the bio_list_lock, the
 102          * plug list is used by the plugging code
 103          * to collect partial bios while plugged.  The
 104          * stripe locking code also uses it to hand off
 105          * the stripe lock to the next pending IO
 106          */
 107         struct list_head plug_list;
 108
 109         /*
 110          * flags that tell us if it is safe to
 111          * merge with this bio
 112          */
 113         unsigned long flags;
 114
 115         /* size of each individual stripe on disk */
 116         int stripe_len;
 117
 118         /* number of data stripes (no p/q) */
 119         int nr_data;
 120
 121         /*
 122          * set if we're doing a parity rebuild
 123          * for a read from higher up, which is handled
 124          * differently from a parity rebuild as part of
 125          * rmw
 126          */
 127         int read_rebuild;
 128
 129         /* first bad stripe */
 130         int faila;
 131
 132         /* second bad stripe (for raid6 use) */
 133         int failb;
 134
 135         /*
 136          * number of pages needed to represent the full
 137          * stripe
 138          */
 139         int nr_pages;
 140
 141         /*
 142          * size of all the bios in the bio_list.  This
 143          * helps us decide if the rbio maps to a full
 144          * stripe or not
 145          */
 146         int bio_list_bytes;
 147
 148         atomic_t refs;
 149
 150         /*
 151          * these are two arrays of pointers.  We allocate the
 152          * rbio big enough to hold them both and setup their
 153          * locations when the rbio is allocated
 154          */
 155
 156         /* pointers to pages that we allocated for
 157          * reading/writing stripes directly from the disk (including P/Q)
 158          */
 159         struct page **stripe_pages;
 160
 161         /*
 162          * pointers to the pages in the bio_list.  Stored
 163          * here for faster lookup
 164          */
 165         struct page **bio_pages;
 166 };
 167
 168 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 169 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
 170 static void rmw_work(struct btrfs_work *work);
 171 static void read_rebuild_work(struct btrfs_work *work);
 172 static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
 173 static void async_read_rebuild(struct btrfs_raid_bio *rbio);
 174 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
 175 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
 176 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
 177 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
 178 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
 179
 180 /*
 181  * the stripe hash table is used for locking, and to collect
 182  * bios in hopes of making a full stripe
 183  */
 184 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 185 {
 186         struct btrfs_stripe_hash_table *table;
 187         struct btrfs_stripe_hash_table *x;
 188         struct btrfs_stripe_hash *cur;
 189         struct btrfs_stripe_hash *h;
 190         int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
 191         int i;
 192         int table_size;
 193
 194         if (info->stripe_hash_table)
 195                 return 0;
 196
 197         /*
 198          * The table is large, starting with order 4 and can go as high as
 199          * order 7 in case lock debugging is turned on.
 200          *
 201          * Try harder to allocate and fallback to vmalloc to lower the chance
 202          * of a failing mount.
 203          */
 204         table_size = sizeof(*table) + sizeof(*h) * num_entries;
 205         table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
 206         if (!table) {
 207                 table = vzalloc(table_size);
 208                 if (!table)
 209                         return -ENOMEM;
 210         }
 211
 212         spin_lock_init(&table->cache_lock);
 213         INIT_LIST_HEAD(&table->stripe_cache);
 214
 215         h = table->table;
 216
 217         for (i = 0; i < num_entries; i++) {
 218                 cur = h + i;
 219                 INIT_LIST_HEAD(&cur->hash_list);
 220                 spin_lock_init(&cur->lock);
 221                 init_waitqueue_head(&cur->wait);
 222         }
 223
 224         x = cmpxchg(&info->stripe_hash_table, NULL, table);
 225         if (x) {
 226                 if (is_vmalloc_addr(x))
 227                         vfree(x);
 228                 else
 229                         kfree(x);
 230         }
 231         return 0;
 232 }
 233
 234 /*
 235  * caching an rbio means to copy anything from the
 236  * bio_pages array into the stripe_pages array.  We
 237  * use the page uptodate bit in the stripe cache array
 238  * to indicate if it has valid data
 239  *
 240  * once the caching is done, we set the cache ready
 241  * bit.
 242  */
 243 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
 244 {
 245         int i;
 246         char *s;
 247         char *d;
 248         int ret;
 249
 250         ret = alloc_rbio_pages(rbio);
 251         if (ret)
 252                 return;
 253
 254         for (i = 0; i < rbio->nr_pages; i++) {
 255                 if (!rbio->bio_pages[i])
 256                         continue;
 257
 258                 s = kmap(rbio->bio_pages[i]);
 259                 d = kmap(rbio->stripe_pages[i]);
 260
 261                 memcpy(d, s, PAGE_CACHE_SIZE);
 262
 263                 kunmap(rbio->bio_pages[i]);
 264                 kunmap(rbio->stripe_pages[i]);
 265                 SetPageUptodate(rbio->stripe_pages[i]);
 266         }
 267         set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 268 }
 269
 270 /*
 271  * we hash on the first logical address of the stripe
 272  */
 273 static int rbio_bucket(struct btrfs_raid_bio *rbio)
 274 {
 275         u64 num = rbio->raid_map[0];
 276
 277         /*
 278          * we shift down quite a bit.  We're using byte
 279          * addressing, and most of the lower bits are zeros.
 280          * This tends to upset hash_64, and it consistently
 281          * returns just one or two different values.
 282          *
 283          * shifting off the lower bits fixes things.
 284          */
 285         return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
 286 }
 287
 288 /*
 289  * stealing an rbio means taking all the uptodate pages from the stripe
 290  * array in the source rbio and putting them into the destination rbio
 291  */
 292 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 293 {
 294         int i;
 295         struct page *s;
 296         struct page *d;
 297
 298         if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
 299                 return;
 300
 301         for (i = 0; i < dest->nr_pages; i++) {
 302                 s = src->stripe_pages[i];
 303                 if (!s || !PageUptodate(s)) {
 304                         continue;
 305                 }
 306
 307                 d = dest->stripe_pages[i];
 308                 if (d)
 309                         __free_page(d);
 310
 311                 dest->stripe_pages[i] = s;
 312                 src->stripe_pages[i] = NULL;
 313         }
 314 }
 315
 316 /*
 317  * merging means we take the bio_list from the victim and
 318  * splice it into the destination.  The victim should
 319  * be discarded afterwards.
 320  *
 321  * must be called with dest->rbio_list_lock held
 322  */
 323 static void merge_rbio(struct btrfs_raid_bio *dest,
 324                        struct btrfs_raid_bio *victim)
 325 {
 326         bio_list_merge(&dest->bio_list, &victim->bio_list);
 327         dest->bio_list_bytes += victim->bio_list_bytes;
 328         bio_list_init(&victim->bio_list);
 329 }
 330
 331 /*
 332  * used to prune items that are in the cache.  The caller
 333  * must hold the hash table lock.
 334  */
 335 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 336 {
 337         int bucket = rbio_bucket(rbio);
 338         struct btrfs_stripe_hash_table *table;
 339         struct btrfs_stripe_hash *h;
 340         int freeit = 0;
 341
 342         /*
 343          * check the bit again under the hash table lock.
 344          */
 345         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 346                 return;
 347
 348         table = rbio->fs_info->stripe_hash_table;
 349         h = table->table + bucket;
 350
 351         /* hold the lock for the bucket because we may be
 352          * removing it from the hash table
 353          */
 354         spin_lock(&h->lock);
 355
 356         /*
 357          * hold the lock for the bio list because we need
 358          * to make sure the bio list is empty
 359          */
 360         spin_lock(&rbio->bio_list_lock);
 361
 362         if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 363                 list_del_init(&rbio->stripe_cache);
 364                 table->cache_size -= 1;
 365                 freeit = 1;
 366
 367                 /* if the bio list isn't empty, this rbio is
 368                  * still involved in an IO.  We take it out
 369                  * of the cache list, and drop the ref that
 370                  * was held for the list.
 371                  *
 372                  * If the bio_list was empty, we also remove
 373                  * the rbio from the hash_table, and drop
 374                  * the corresponding ref
 375                  */
 376                 if (bio_list_empty(&rbio->bio_list)) {
 377                         if (!list_empty(&rbio->hash_list)) {
 378                                 list_del_init(&rbio->hash_list);
 379                                 atomic_dec(&rbio->refs);
 380                                 BUG_ON(!list_empty(&rbio->plug_list));
 381                         }
 382                 }
 383         }
 384
 385         spin_unlock(&rbio->bio_list_lock);
 386         spin_unlock(&h->lock);
 387
 388         if (freeit)
 389                 __free_raid_bio(rbio);
 390 }
 391
 392 /*
 393  * prune a given rbio from the cache
 394  */
 395 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 396 {
 397         struct btrfs_stripe_hash_table *table;
 398         unsigned long flags;
 399
 400         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 401                 return;
 402
 403         table = rbio->fs_info->stripe_hash_table;
 404
 405         spin_lock_irqsave(&table->cache_lock, flags);
 406         __remove_rbio_from_cache(rbio);
 407         spin_unlock_irqrestore(&table->cache_lock, flags);
 408 }
 409
 410 /*
 411  * remove everything in the cache
 412  */
 413 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
 414 {
 415         struct btrfs_stripe_hash_table *table;
 416         unsigned long flags;
 417         struct btrfs_raid_bio *rbio;
 418
 419         table = info->stripe_hash_table;
 420
 421         spin_lock_irqsave(&table->cache_lock, flags);
 422         while (!list_empty(&table->stripe_cache)) {
 423                 rbio = list_entry(table->stripe_cache.next,
 424                                   struct btrfs_raid_bio,
 425                                   stripe_cache);
 426                 __remove_rbio_from_cache(rbio);
 427         }
 428         spin_unlock_irqrestore(&table->cache_lock, flags);
 429 }
 430
 431 /*
 432  * remove all cached entries and free the hash table
 433  * used by unmount
 434  */
 435 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 436 {
 437         if (!info->stripe_hash_table)
 438                 return;
 439         btrfs_clear_rbio_cache(info);
 440         if (is_vmalloc_addr(info->stripe_hash_table))
 441                 vfree(info->stripe_hash_table);
 442         else
 443                 kfree(info->stripe_hash_table);
 444         info->stripe_hash_table = NULL;
 445 }
 446
 447 /*
 448  * insert an rbio into the stripe cache.  It
 449  * must have already been prepared by calling
 450  * cache_rbio_pages
 451  *
 452  * If this rbio was already cached, it gets
 453  * moved to the front of the lru.
 454  *
 455  * If the size of the rbio cache is too big, we
 456  * prune an item.
 457  */
 458 static void cache_rbio(struct btrfs_raid_bio *rbio)
 459 {
 460         struct btrfs_stripe_hash_table *table;
 461         unsigned long flags;
 462
 463         if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
 464                 return;
 465
 466         table = rbio->fs_info->stripe_hash_table;
 467
 468         spin_lock_irqsave(&table->cache_lock, flags);
 469         spin_lock(&rbio->bio_list_lock);
 470
 471         /* bump our ref if we were not in the list before */
 472         if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
 473                 atomic_inc(&rbio->refs);
 474
 475         if (!list_empty(&rbio->stripe_cache)){
 476                 list_move(&rbio->stripe_cache, &table->stripe_cache);
 477         } else {
 478                 list_add(&rbio->stripe_cache, &table->stripe_cache);
 479                 table->cache_size += 1;
 480         }
 481
 482         spin_unlock(&rbio->bio_list_lock);
 483
 484         if (table->cache_size > RBIO_CACHE_SIZE) {
 485                 struct btrfs_raid_bio *found;
 486
 487                 found = list_entry(table->stripe_cache.prev,
 488                                   struct btrfs_raid_bio,
 489                                   stripe_cache);
 490
 491                 if (found != rbio)
 492                         __remove_rbio_from_cache(found);
 493         }
 494
 495         spin_unlock_irqrestore(&table->cache_lock, flags);
 496         return;
 497 }
 498
 499 /*
 500  * helper function to run the xor_blocks api.  It is only
 501  * able to do MAX_XOR_BLOCKS at a time, so we need to
 502  * loop through.
 503  */
 504 static void run_xor(void **pages, int src_cnt, ssize_t len)
 505 {
 506         int src_off = 0;
 507         int xor_src_cnt = 0;
 508         void *dest = pages[src_cnt];
 509
 510         while(src_cnt > 0) {
 511                 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
 512                 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
 513
 514                 src_cnt -= xor_src_cnt;
 515                 src_off += xor_src_cnt;
 516         }
 517 }
 518
 519 /*
 520  * returns true if the bio list inside this rbio
 521  * covers an entire stripe (no rmw required).
 522  * Must be called with the bio list lock held, or
 523  * at a time when you know it is impossible to add
 524  * new bios into the list
 525  */
 526 static int __rbio_is_full(struct btrfs_raid_bio *rbio)
 527 {
 528         unsigned long size = rbio->bio_list_bytes;
 529         int ret = 1;
 530
 531         if (size != rbio->nr_data * rbio->stripe_len)
 532                 ret = 0;
 533
 534         BUG_ON(size > rbio->nr_data * rbio->stripe_len);
 535         return ret;
 536 }
 537
 538 static int rbio_is_full(struct btrfs_raid_bio *rbio)
 539 {
 540         unsigned long flags;
 541         int ret;
 542
 543         spin_lock_irqsave(&rbio->bio_list_lock, flags);
 544         ret = __rbio_is_full(rbio);
 545         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
 546         return ret;
 547 }
 548
 549 /*
 550  * returns 1 if it is safe to merge two rbios together.
 551  * The merging is safe if the two rbios correspond to
 552  * the same stripe and if they are both going in the same
 553  * direction (read vs write), and if neither one is
 554  * locked for final IO
 555  *
 556  * The caller is responsible for locking such that
 557  * rmw_locked is safe to test
 558  */
 559 static int rbio_can_merge(struct btrfs_raid_bio *last,
 560                           struct btrfs_raid_bio *cur)
 561 {
 562         if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
 563             test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 564                 return 0;
 565
 566         /*
 567          * we can't merge with cached rbios, since the
 568          * idea is that when we merge the destination
 569          * rbio is going to run our IO for us.  We can
 570          * steal from cached rbio's though, other functions
 571          * handle that.
 572          */
 573         if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
 574             test_bit(RBIO_CACHE_BIT, &cur->flags))
 575                 return 0;
 576
 577         if (last->raid_map[0] !=
 578             cur->raid_map[0])
 579                 return 0;
 580
 581         /* reads can't merge with writes */
 582         if (last->read_rebuild !=
 583             cur->read_rebuild) {
 584                 return 0;
 585         }
 586
 587         return 1;
 588 }
 589
 590 /*
 591  * helper to index into the pstripe
 592  */
 593 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
 594 {
 595         index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
 596         return rbio->stripe_pages[index];
 597 }
 598
 599 /*
 600  * helper to index into the qstripe, returns null
 601  * if there is no qstripe
 602  */
 603 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
 604 {
 605         if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
 606                 return NULL;
 607
 608         index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
 609                 PAGE_CACHE_SHIFT;
 610         return rbio->stripe_pages[index];
 611 }
 612
 613 /*
 614  * The first stripe in the table for a logical address
 615  * has the lock.  rbios are added in one of three ways:
 616  *
 617  * 1) Nobody has the stripe locked yet.  The rbio is given
 618  * the lock and 0 is returned.  The caller must start the IO
 619  * themselves.
 620  *
 621  * 2) Someone has the stripe locked, but we're able to merge
 622  * with the lock owner.  The rbio is freed and the IO will
 623  * start automatically along with the existing rbio.  1 is returned.
 624  *
 625  * 3) Someone has the stripe locked, but we're not able to merge.
 626  * The rbio is added to the lock owner's plug list, or merged into
 627  * an rbio already on the plug list.  When the lock owner unlocks,
 628  * the next rbio on the list is run and the IO is started automatically.
 629  * 1 is returned
 630  *
 631  * If we return 0, the caller still owns the rbio and must continue with
 632  * IO submission.  If we return 1, the caller must assume the rbio has
 633  * already been freed.
 634  */
 635 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 636 {
 637         int bucket = rbio_bucket(rbio);
 638         struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
 639         struct btrfs_raid_bio *cur;
 640         struct btrfs_raid_bio *pending;
 641         unsigned long flags;
 642         DEFINE_WAIT(wait);
 643         struct btrfs_raid_bio *freeit = NULL;
 644         struct btrfs_raid_bio *cache_drop = NULL;
 645         int ret = 0;
 646         int walk = 0;
 647
 648         spin_lock_irqsave(&h->lock, flags);
 649         list_for_each_entry(cur, &h->hash_list, hash_list) {
 650                 walk++;
 651                 if (cur->raid_map[0] == rbio->raid_map[0]) {
 652                         spin_lock(&cur->bio_list_lock);
 653
 654                         /* can we steal this cached rbio's pages? */
 655                         if (bio_list_empty(&cur->bio_list) &&
 656                             list_empty(&cur->plug_list) &&
 657                             test_bit(RBIO_CACHE_BIT, &cur->flags) &&
 658                             !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
 659                                 list_del_init(&cur->hash_list);
 660                                 atomic_dec(&cur->refs);
 661
 662                                 steal_rbio(cur, rbio);
 663                                 cache_drop = cur;
 664                                 spin_unlock(&cur->bio_list_lock);
 665
 666                                 goto lockit;
 667                         }
 668
 669                         /* can we merge into the lock owner? */
 670                         if (rbio_can_merge(cur, rbio)) {
 671                                 merge_rbio(cur, rbio);
 672                                 spin_unlock(&cur->bio_list_lock);
 673                                 freeit = rbio;
 674                                 ret = 1;
 675                                 goto out;
 676                         }
 677
 678
 679                         /*
 680                          * we couldn't merge with the running
 681                          * rbio, see if we can merge with the
 682                          * pending ones.  We don't have to
 683                          * check for rmw_locked because there
 684                          * is no way they are inside finish_rmw
 685                          * right now
 686                          */
 687                         list_for_each_entry(pending, &cur->plug_list,
 688                                             plug_list) {
 689                                 if (rbio_can_merge(pending, rbio)) {
 690                                         merge_rbio(pending, rbio);
 691                                         spin_unlock(&cur->bio_list_lock);
 692                                         freeit = rbio;
 693                                         ret = 1;
 694                                         goto out;
 695                                 }
 696                         }
 697
 698                         /* no merging, put us on the tail of the plug list,
 699                          * our rbio will be started with the currently
 700                          * running rbio unlocks
 701                          */
 702                         list_add_tail(&rbio->plug_list, &cur->plug_list);
 703                         spin_unlock(&cur->bio_list_lock);
 704                         ret = 1;
 705                         goto out;
 706                 }
 707         }
 708 lockit:
 709         atomic_inc(&rbio->refs);
 710         list_add(&rbio->hash_list, &h->hash_list);
 711 out:
 712         spin_unlock_irqrestore(&h->lock, flags);
 713         if (cache_drop)
 714                 remove_rbio_from_cache(cache_drop);
 715         if (freeit)
 716                 __free_raid_bio(freeit);
 717         return ret;
 718 }
 719
 720 /*
 721  * called as rmw or parity rebuild is completed.  If the plug list has more
 722  * rbios waiting for this stripe, the next one on the list will be started
 723  */
 724 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 725 {
 726         int bucket;
 727         struct btrfs_stripe_hash *h;
 728         unsigned long flags;
 729         int keep_cache = 0;
 730
 731         bucket = rbio_bucket(rbio);
 732         h = rbio->fs_info->stripe_hash_table->table + bucket;
 733
 734         if (list_empty(&rbio->plug_list))
 735                 cache_rbio(rbio);
 736
 737         spin_lock_irqsave(&h->lock, flags);
 738         spin_lock(&rbio->bio_list_lock);
 739
 740         if (!list_empty(&rbio->hash_list)) {
 741                 /*
 742                  * if we're still cached and there is no other IO
 743                  * to perform, just leave this rbio here for others
 744                  * to steal from later
 745                  */
 746                 if (list_empty(&rbio->plug_list) &&
 747                     test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 748                         keep_cache = 1;
 749                         clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
 750                         BUG_ON(!bio_list_empty(&rbio->bio_list));
 751                         goto done;
 752                 }
 753
 754                 list_del_init(&rbio->hash_list);
 755                 atomic_dec(&rbio->refs);
 756
 757                 /*
 758                  * we use the plug list to hold all the rbios
 759                  * waiting for the chance to lock this stripe.
 760                  * hand the lock over to one of them.
 761                  */
 762                 if (!list_empty(&rbio->plug_list)) {
 763                         struct btrfs_raid_bio *next;
 764                         struct list_head *head = rbio->plug_list.next;
 765
 766                         next = list_entry(head, struct btrfs_raid_bio,
 767                                           plug_list);
 768
 769                         list_del_init(&rbio->plug_list);
 770
 771                         list_add(&next->hash_list, &h->hash_list);
 772                         atomic_inc(&next->refs);
 773                         spin_unlock(&rbio->bio_list_lock);
 774                         spin_unlock_irqrestore(&h->lock, flags);
 775
 776                         if (next->read_rebuild)
 777                                 async_read_rebuild(next);
 778                         else {
 779                                 steal_rbio(rbio, next);
 780                                 async_rmw_stripe(next);
 781                         }
 782
 783                         goto done_nolock;
 784                 } else  if (waitqueue_active(&h->wait)) {
 785                         spin_unlock(&rbio->bio_list_lock);
 786                         spin_unlock_irqrestore(&h->lock, flags);
 787                         wake_up(&h->wait);
 788                         goto done_nolock;
 789                 }
 790         }
 791 done:
 792         spin_unlock(&rbio->bio_list_lock);
 793         spin_unlock_irqrestore(&h->lock, flags);
 794
 795 done_nolock:
 796         if (!keep_cache)
 797                 remove_rbio_from_cache(rbio);
 798 }
 799
 800 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 801 {
 802         int i;
 803
 804         WARN_ON(atomic_read(&rbio->refs) < 0);
 805         if (!atomic_dec_and_test(&rbio->refs))
 806                 return;
 807
 808         WARN_ON(!list_empty(&rbio->stripe_cache));
 809         WARN_ON(!list_empty(&rbio->hash_list));
 810         WARN_ON(!bio_list_empty(&rbio->bio_list));
 811
 812         for (i = 0; i < rbio->nr_pages; i++) {
 813                 if (rbio->stripe_pages[i]) {
 814                         __free_page(rbio->stripe_pages[i]);
 815                         rbio->stripe_pages[i] = NULL;
 816                 }
 817         }
 818         kfree(rbio->raid_map);
 819         kfree(rbio->bbio);
 820         kfree(rbio);
 821 }
 822
 823 static void free_raid_bio(struct btrfs_raid_bio *rbio)
 824 {
 825         unlock_stripe(rbio);
 826         __free_raid_bio(rbio);
 827 }
 828
 829 /*
 830  * this frees the rbio and runs through all the bios in the
 831  * bio_list and calls end_io on them
 832  */
 833 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
 834 {
 835         struct bio *cur = bio_list_get(&rbio->bio_list);
 836         struct bio *next;
 837         free_raid_bio(rbio);
 838
 839         while (cur) {
 840                 next = cur->bi_next;
 841                 cur->bi_next = NULL;
 842                 if (uptodate)
 843                         set_bit(BIO_UPTODATE, &cur->bi_flags);
 844                 bio_endio(cur, err);
 845                 cur = next;
 846         }
 847 }
 848
 849 /*
 850  * end io function used by finish_rmw.  When we finally
 851  * get here, we've written a full stripe
 852  */
 853 static void raid_write_end_io(struct bio *bio, int err)
 854 {
 855         struct btrfs_raid_bio *rbio = bio->bi_private;
 856
 857         if (err)
 858                 fail_bio_stripe(rbio, bio);
 859
 860         bio_put(bio);
 861
 862         if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
 863                 return;
 864
 865         err = 0;
 866
 867         /* OK, we have read all the stripes we need to. */
 868         if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
 869                 err = -EIO;
 870
 871         rbio_orig_end_io(rbio, err, 0);
 872         return;
 873 }
 874
 875 /*
 876  * the read/modify/write code wants to use the original bio for
 877  * any pages it included, and then use the rbio for everything
 878  * else.  This function decides if a given index (stripe number)
 879  * and page number in that stripe fall inside the original bio
 880  * or the rbio.
 881  *
 882  * if you set bio_list_only, you'll get a NULL back for any ranges
 883  * that are outside the bio_list
 884  *
 885  * This doesn't take any refs on anything, you get a bare page pointer
 886  * and the caller must bump refs as required.
 887  *
 888  * You must call index_rbio_pages once before you can trust
 889  * the answers from this function.
 890  */
 891 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
 892                                  int index, int pagenr, int bio_list_only)
 893 {
 894         int chunk_page;
 895         struct page *p = NULL;
 896
 897         chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
 898
 899         spin_lock_irq(&rbio->bio_list_lock);
 900         p = rbio->bio_pages[chunk_page];
 901         spin_unlock_irq(&rbio->bio_list_lock);
 902
 903         if (p || bio_list_only)
 904                 return p;
 905
 906         return rbio->stripe_pages[chunk_page];
 907 }
 908
 909 /*
 910  * number of pages we need for the entire stripe across all the
 911  * drives
 912  */
 913 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
 914 {
 915         unsigned long nr = stripe_len * nr_stripes;
 916         return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 917 }
 918
 919 /*
 920  * allocation and initial setup for the btrfs_raid_bio.  Not
 921  * this does not allocate any pages for rbio->pages.
 922  */
 923 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 924                           struct btrfs_bio *bbio, u64 *raid_map,
 925                           u64 stripe_len)
 926 {
 927         struct btrfs_raid_bio *rbio;
 928         int nr_data = 0;
 929         int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
 930         void *p;
 931
 932         rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
 933                         GFP_NOFS);
 934         if (!rbio) {
 935                 kfree(raid_map);
 936                 kfree(bbio);
 937                 return ERR_PTR(-ENOMEM);
 938         }
 939
 940         bio_list_init(&rbio->bio_list);
 941         INIT_LIST_HEAD(&rbio->plug_list);
 942         spin_lock_init(&rbio->bio_list_lock);
 943         INIT_LIST_HEAD(&rbio->stripe_cache);
 944         INIT_LIST_HEAD(&rbio->hash_list);
 945         rbio->bbio = bbio;
 946         rbio->raid_map = raid_map;
 947         rbio->fs_info = root->fs_info;
 948         rbio->stripe_len = stripe_len;
 949         rbio->nr_pages = num_pages;
 950         rbio->faila = -1;
 951         rbio->failb = -1;
 952         atomic_set(&rbio->refs, 1);
 953
 954         /*
 955          * the stripe_pages and bio_pages array point to the extra
 956          * memory we allocated past the end of the rbio
 957          */
 958         p = rbio + 1;
 959         rbio->stripe_pages = p;
 960         rbio->bio_pages = p + sizeof(struct page *) * num_pages;
 961
 962         if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
 963                 nr_data = bbio->num_stripes - 2;
 964         else
 965                 nr_data = bbio->num_stripes - 1;
 966
 967         rbio->nr_data = nr_data;
 968         return rbio;
 969 }
 970
 971 /* allocate pages for all the stripes in the bio, including parity */
 972 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
 973 {
 974         int i;
 975         struct page *page;
 976
 977         for (i = 0; i < rbio->nr_pages; i++) {
 978                 if (rbio->stripe_pages[i])
 979                         continue;
 980                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 981                 if (!page)
 982                         return -ENOMEM;
 983                 rbio->stripe_pages[i] = page;
 984                 ClearPageUptodate(page);
 985         }
 986         return 0;
 987 }
 988
 989 /* allocate pages for just the p/q stripes */
 990 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
 991 {
 992         int i;
 993         struct page *page;
 994
 995         i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
 996
 997         for (; i < rbio->nr_pages; i++) {
 998                 if (rbio->stripe_pages[i])
 999                         continue;
1000                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1001                 if (!page)
1002                         return -ENOMEM;
1003                 rbio->stripe_pages[i] = page;
1004         }
1005         return 0;
1006 }
1007
1008 /*
1009  * add a single page from a specific stripe into our list of bios for IO
1010  * this will try to merge into existing bios if possible, and returns
1011  * zero if all went well.
1012  */
1013 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1014                             struct bio_list *bio_list,
1015                             struct page *page,
1016                             int stripe_nr,
1017                             unsigned long page_index,
1018                             unsigned long bio_max_len)
1019 {
1020         struct bio *last = bio_list->tail;
1021         u64 last_end = 0;
1022         int ret;
1023         struct bio *bio;
1024         struct btrfs_bio_stripe *stripe;
1025         u64 disk_start;
1026
1027         stripe = &rbio->bbio->stripes[stripe_nr];
1028         disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1029
1030         /* if the device is missing, just fail this stripe */
1031         if (!stripe->dev->bdev)
1032                 return fail_rbio_index(rbio, stripe_nr);
1033
1034         /* see if we can add this page onto our existing bio */
1035         if (last) {
1036                 last_end = (u64)last->bi_sector << 9;
1037                 last_end += last->bi_size;
1038
1039                 /*
1040                  * we can't merge these if they are from different
1041                  * devices or if they are not contiguous
1042                  */
1043                 if (last_end == disk_start && stripe->dev->bdev &&
1044                     test_bit(BIO_UPTODATE, &last->bi_flags) &&
1045                     last->bi_bdev == stripe->dev->bdev) {
1046                         ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1047                         if (ret == PAGE_CACHE_SIZE)
1048                                 return 0;
1049                 }
1050         }
1051
1052         /* put a new bio on the list */
1053         bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
1054         if (!bio)
1055                 return -ENOMEM;
1056
1057         bio->bi_size = 0;
1058         bio->bi_bdev = stripe->dev->bdev;
1059         bio->bi_sector = disk_start >> 9;
1060         set_bit(BIO_UPTODATE, &bio->bi_flags);
1061
1062         bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1063         bio_list_add(bio_list, bio);
1064         return 0;
1065 }
1066
1067 /*
1068  * while we're doing the read/modify/write cycle, we could
1069  * have errors in reading pages off the disk.  This checks
1070  * for errors and if we're not able to read the page it'll
1071  * trigger parity reconstruction.  The rmw will be finished
1072  * after we've reconstructed the failed stripes
1073  */
1074 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1075 {
1076         if (rbio->faila >= 0 || rbio->failb >= 0) {
1077                 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1078                 __raid56_parity_recover(rbio);
1079         } else {
1080                 finish_rmw(rbio);
1081         }
1082 }
1083
1084 /*
1085  * these are just the pages from the rbio array, not from anything
1086  * the FS sent down to us
1087  */
1088 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1089 {
1090         int index;
1091         index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1092         index += page;
1093         return rbio->stripe_pages[index];
1094 }
1095
1096 /*
1097  * helper function to walk our bio list and populate the bio_pages array with
1098  * the result.  This seems expensive, but it is faster than constantly
1099  * searching through the bio list as we setup the IO in finish_rmw or stripe
1100  * reconstruction.
1101  *
1102  * This must be called before you trust the answers from page_in_rbio
1103  */
1104 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1105 {
1106         struct bio *bio;
1107         u64 start;
1108         unsigned long stripe_offset;
1109         unsigned long page_index;
1110         struct page *p;
1111         int i;
1112
1113         spin_lock_irq(&rbio->bio_list_lock);
1114         bio_list_for_each(bio, &rbio->bio_list) {
1115                 start = (u64)bio->bi_sector << 9;
1116                 stripe_offset = start - rbio->raid_map[0];
1117                 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1118
1119                 for (i = 0; i < bio->bi_vcnt; i++) {
1120                         p = bio->bi_io_vec[i].bv_page;
1121                         rbio->bio_pages[page_index + i] = p;
1122                 }
1123         }
1124         spin_unlock_irq(&rbio->bio_list_lock);
1125 }
1126
1127 /*
1128  * this is called from one of two situations.  We either
1129  * have a full stripe from the higher layers, or we've read all
1130  * the missing bits off disk.
1131  *
1132  * This will calculate the parity and then send down any
1133  * changed blocks.
1134  */
1135 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1136 {
1137         struct btrfs_bio *bbio = rbio->bbio;
1138         void *pointers[bbio->num_stripes];
1139         int stripe_len = rbio->stripe_len;
1140         int nr_data = rbio->nr_data;
1141         int stripe;
1142         int pagenr;
1143         int p_stripe = -1;
1144         int q_stripe = -1;
1145         struct bio_list bio_list;
1146         struct bio *bio;
1147         int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1148         int ret;
1149
1150         bio_list_init(&bio_list);
1151
1152         if (bbio->num_stripes - rbio->nr_data == 1) {
1153                 p_stripe = bbio->num_stripes - 1;
1154         } else if (bbio->num_stripes - rbio->nr_data == 2) {
1155                 p_stripe = bbio->num_stripes - 2;
1156                 q_stripe = bbio->num_stripes - 1;
1157         } else {
1158                 BUG();
1159         }
1160
1161         /* at this point we either have a full stripe,
1162          * or we've read the full stripe from the drive.
1163          * recalculate the parity and write the new results.
1164          *
1165          * We're not allowed to add any new bios to the
1166          * bio list here, anyone else that wants to
1167          * change this stripe needs to do their own rmw.
1168          */
1169         spin_lock_irq(&rbio->bio_list_lock);
1170         set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1171         spin_unlock_irq(&rbio->bio_list_lock);
1172
1173         atomic_set(&rbio->bbio->error, 0);
1174
1175         /*
1176          * now that we've set rmw_locked, run through the
1177          * bio list one last time and map the page pointers
1178          *
1179          * We don't cache full rbios because we're assuming
1180          * the higher layers are unlikely to use this area of
1181          * the disk again soon.  If they do use it again,
1182          * hopefully they will send another full bio.
1183          */
1184         index_rbio_pages(rbio);
1185         if (!rbio_is_full(rbio))
1186                 cache_rbio_pages(rbio);
1187         else
1188                 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1189
1190         for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1191                 struct page *p;
1192                 /* first collect one page from each data stripe */
1193                 for (stripe = 0; stripe < nr_data; stripe++) {
1194                         p = page_in_rbio(rbio, stripe, pagenr, 0);
1195                         pointers[stripe] = kmap(p);
1196                 }
1197
1198                 /* then add the parity stripe */
1199                 p = rbio_pstripe_page(rbio, pagenr);
1200                 SetPageUptodate(p);
1201                 pointers[stripe++] = kmap(p);
1202
1203                 if (q_stripe != -1) {
1204
1205                         /*
1206                          * raid6, add the qstripe and call the
1207                          * library function to fill in our p/q
1208                          */
1209                         p = rbio_qstripe_page(rbio, pagenr);
1210                         SetPageUptodate(p);
1211                         pointers[stripe++] = kmap(p);
1212
1213                         raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1214                                                 pointers);
1215                 } else {
1216                         /* raid5 */
1217                         memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1218                         run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1219                 }
1220
1221
1222                 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1223                         kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1224         }
1225
1226         /*
1227          * time to start writing.  Make bios for everything from the
1228          * higher layers (the bio_list in our rbio) and our p/q.  Ignore
1229          * everything else.
1230          */
1231         for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1232                 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1233                         struct page *page;
1234                         if (stripe < rbio->nr_data) {
1235                                 page = page_in_rbio(rbio, stripe, pagenr, 1);
1236                                 if (!page)
1237                                         continue;
1238                         } else {
1239                                page = rbio_stripe_page(rbio, stripe, pagenr);
1240                         }
1241
1242                         ret = rbio_add_io_page(rbio, &bio_list,
1243                                        page, stripe, pagenr, rbio->stripe_len);
1244                         if (ret)
1245                                 goto cleanup;
1246                 }
1247         }
1248
1249         atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
1250         BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
1251
1252         while (1) {
1253                 bio = bio_list_pop(&bio_list);
1254                 if (!bio)
1255                         break;
1256
1257                 bio->bi_private = rbio;
1258                 bio->bi_end_io = raid_write_end_io;
1259                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1260                 submit_bio(WRITE, bio);
1261         }
1262         return;
1263
1264 cleanup:
1265         rbio_orig_end_io(rbio, -EIO, 0);
1266 }
1267
1268 /*
1269  * helper to find the stripe number for a given bio.  Used to figure out which
1270  * stripe has failed.  This expects the bio to correspond to a physical disk,
1271  * so it looks up based on physical sector numbers.
1272  */
1273 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1274                            struct bio *bio)
1275 {
1276         u64 physical = bio->bi_sector;
1277         u64 stripe_start;
1278         int i;
1279         struct btrfs_bio_stripe *stripe;
1280
1281         physical <<= 9;
1282
1283         for (i = 0; i < rbio->bbio->num_stripes; i++) {
1284                 stripe = &rbio->bbio->stripes[i];
1285                 stripe_start = stripe->physical;
1286                 if (physical >= stripe_start &&
1287                     physical < stripe_start + rbio->stripe_len) {
1288                         return i;
1289                 }
1290         }
1291         return -1;
1292 }
1293
1294 /*
1295  * helper to find the stripe number for a given
1296  * bio (before mapping).  Used to figure out which stripe has
1297  * failed.  This looks up based on logical block numbers.
1298  */
1299 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1300                                    struct bio *bio)
1301 {
1302         u64 logical = bio->bi_sector;
1303         u64 stripe_start;
1304         int i;
1305
1306         logical <<= 9;
1307
1308         for (i = 0; i < rbio->nr_data; i++) {
1309                 stripe_start = rbio->raid_map[i];
1310                 if (logical >= stripe_start &&
1311                     logical < stripe_start + rbio->stripe_len) {
1312                         return i;
1313                 }
1314         }
1315         return -1;
1316 }
1317
1318 /*
1319  * returns -EIO if we had too many failures
1320  */
1321 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1322 {
1323         unsigned long flags;
1324         int ret = 0;
1325
1326         spin_lock_irqsave(&rbio->bio_list_lock, flags);
1327
1328         /* we already know this stripe is bad, move on */
1329         if (rbio->faila == failed || rbio->failb == failed)
1330                 goto out;
1331
1332         if (rbio->faila == -1) {
1333                 /* first failure on this rbio */
1334                 rbio->faila = failed;
1335                 atomic_inc(&rbio->bbio->error);
1336         } else if (rbio->failb == -1) {
1337                 /* second failure on this rbio */
1338                 rbio->failb = failed;
1339                 atomic_inc(&rbio->bbio->error);
1340         } else {
1341                 ret = -EIO;
1342         }
1343 out:
1344         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1345
1346         return ret;
1347 }
1348
1349 /*
1350  * helper to fail a stripe based on a physical disk
1351  * bio.
1352  */
1353 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1354                            struct bio *bio)
1355 {
1356         int failed = find_bio_stripe(rbio, bio);
1357
1358         if (failed < 0)
1359                 return -EIO;
1360
1361         return fail_rbio_index(rbio, failed);
1362 }
1363
1364 /*
1365  * this sets each page in the bio uptodate.  It should only be used on private
1366  * rbio pages, nothing that comes in from the higher layers
1367  */
1368 static void set_bio_pages_uptodate(struct bio *bio)
1369 {
1370         int i;
1371         struct page *p;
1372
1373         for (i = 0; i < bio->bi_vcnt; i++) {
1374                 p = bio->bi_io_vec[i].bv_page;
1375                 SetPageUptodate(p);
1376         }
1377 }
1378
1379 /*
1380  * end io for the read phase of the rmw cycle.  All the bios here are physical
1381  * stripe bios we've read from the disk so we can recalculate the parity of the
1382  * stripe.
1383  *
1384  * This will usually kick off finish_rmw once all the bios are read in, but it
1385  * may trigger parity reconstruction if we had any errors along the way
1386  */
1387 static void raid_rmw_end_io(struct bio *bio, int err)
1388 {
1389         struct btrfs_raid_bio *rbio = bio->bi_private;
1390
1391         if (err)
1392                 fail_bio_stripe(rbio, bio);
1393         else
1394                 set_bio_pages_uptodate(bio);
1395
1396         bio_put(bio);
1397
1398         if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1399                 return;
1400
1401         err = 0;
1402         if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1403                 goto cleanup;
1404
1405         /*
1406          * this will normally call finish_rmw to start our write
1407          * but if there are any failed stripes we'll reconstruct
1408          * from parity first
1409          */
1410         validate_rbio_for_rmw(rbio);
1411         return;
1412
1413 cleanup:
1414
1415         rbio_orig_end_io(rbio, -EIO, 0);
1416 }
1417
1418 static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1419 {
1420         rbio->work.flags = 0;
1421         rbio->work.func = rmw_work;
1422
1423         btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1424                            &rbio->work);
1425 }
1426
1427 static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1428 {
1429         rbio->work.flags = 0;
1430         rbio->work.func = read_rebuild_work;
1431
1432         btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1433                            &rbio->work);
1434 }
1435
1436 /*
1437  * the stripe must be locked by the caller.  It will
1438  * unlock after all the writes are done
1439  */
1440 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1441 {
1442         int bios_to_read = 0;
1443         struct btrfs_bio *bbio = rbio->bbio;
1444         struct bio_list bio_list;
1445         int ret;
1446         int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1447         int pagenr;
1448         int stripe;
1449         struct bio *bio;
1450
1451         bio_list_init(&bio_list);
1452
1453         ret = alloc_rbio_pages(rbio);
1454         if (ret)
1455                 goto cleanup;
1456
1457         index_rbio_pages(rbio);
1458
1459         atomic_set(&rbio->bbio->error, 0);
1460         /*
1461          * build a list of bios to read all the missing parts of this
1462          * stripe
1463          */
1464         for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1465                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1466                         struct page *page;
1467                         /*
1468                          * we want to find all the pages missing from
1469                          * the rbio and read them from the disk.  If
1470                          * page_in_rbio finds a page in the bio list
1471                          * we don't need to read it off the stripe.
1472                          */
1473                         page = page_in_rbio(rbio, stripe, pagenr, 1);
1474                         if (page)
1475                                 continue;
1476
1477                         page = rbio_stripe_page(rbio, stripe, pagenr);
1478                         /*
1479                          * the bio cache may have handed us an uptodate
1480                          * page.  If so, be happy and use it
1481                          */
1482                         if (PageUptodate(page))
1483                                 continue;
1484
1485                         ret = rbio_add_io_page(rbio, &bio_list, page,
1486                                        stripe, pagenr, rbio->stripe_len);
1487                         if (ret)
1488                                 goto cleanup;
1489                 }
1490         }
1491
1492         bios_to_read = bio_list_size(&bio_list);
1493         if (!bios_to_read) {
1494                 /*
1495                  * this can happen if others have merged with
1496                  * us, it means there is nothing left to read.
1497                  * But if there are missing devices it may not be
1498                  * safe to do the full stripe write yet.
1499                  */
1500                 goto finish;
1501         }
1502
1503         /*
1504          * the bbio may be freed once we submit the last bio.  Make sure
1505          * not to touch it after that
1506          */
1507         atomic_set(&bbio->stripes_pending, bios_to_read);
1508         while (1) {
1509                 bio = bio_list_pop(&bio_list);
1510                 if (!bio)
1511                         break;
1512
1513                 bio->bi_private = rbio;
1514                 bio->bi_end_io = raid_rmw_end_io;
1515
1516                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1517                                     BTRFS_WQ_ENDIO_RAID56);
1518
1519                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1520                 submit_bio(READ, bio);
1521         }
1522         /* the actual write will happen once the reads are done */
1523         return 0;
1524
1525 cleanup:
1526         rbio_orig_end_io(rbio, -EIO, 0);
1527         return -EIO;
1528
1529 finish:
1530         validate_rbio_for_rmw(rbio);
1531         return 0;
1532 }
1533
1534 /*
1535  * if the upper layers pass in a full stripe, we thank them by only allocating
1536  * enough pages to hold the parity, and sending it all down quickly.
1537  */
1538 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1539 {
1540         int ret;
1541
1542         ret = alloc_rbio_parity_pages(rbio);
1543         if (ret) {
1544                 __free_raid_bio(rbio);
1545                 return ret;
1546         }
1547
1548         ret = lock_stripe_add(rbio);
1549         if (ret == 0)
1550                 finish_rmw(rbio);
1551         return 0;
1552 }
1553
1554 /*
1555  * partial stripe writes get handed over to async helpers.
1556  * We're really hoping to merge a few more writes into this
1557  * rbio before calculating new parity
1558  */
1559 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1560 {
1561         int ret;
1562
1563         ret = lock_stripe_add(rbio);
1564         if (ret == 0)
1565                 async_rmw_stripe(rbio);
1566         return 0;
1567 }
1568
1569 /*
1570  * sometimes while we were reading from the drive to
1571  * recalculate parity, enough new bios come into create
1572  * a full stripe.  So we do a check here to see if we can
1573  * go directly to finish_rmw
1574  */
1575 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1576 {
1577         /* head off into rmw land if we don't have a full stripe */
1578         if (!rbio_is_full(rbio))
1579                 return partial_stripe_write(rbio);
1580         return full_stripe_write(rbio);
1581 }
1582
1583 /*
1584  * We use plugging call backs to collect full stripes.
1585  * Any time we get a partial stripe write while plugged
1586  * we collect it into a list.  When the unplug comes down,
1587  * we sort the list by logical block number and merge
1588  * everything we can into the same rbios
1589  */
1590 struct btrfs_plug_cb {
1591         struct blk_plug_cb cb;
1592         struct btrfs_fs_info *info;
1593         struct list_head rbio_list;
1594         struct btrfs_work work;
1595 };
1596
1597 /*
1598  * rbios on the plug list are sorted for easier merging.
1599  */
1600 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1601 {
1602         struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1603                                                  plug_list);
1604         struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1605                                                  plug_list);
1606         u64 a_sector = ra->bio_list.head->bi_sector;
1607         u64 b_sector = rb->bio_list.head->bi_sector;
1608
1609         if (a_sector < b_sector)
1610                 return -1;
1611         if (a_sector > b_sector)
1612                 return 1;
1613         return 0;
1614 }
1615
1616 static void run_plug(struct btrfs_plug_cb *plug)
1617 {
1618         struct btrfs_raid_bio *cur;
1619         struct btrfs_raid_bio *last = NULL;
1620
1621         /*
1622          * sort our plug list then try to merge
1623          * everything we can in hopes of creating full
1624          * stripes.
1625          */
1626         list_sort(NULL, &plug->rbio_list, plug_cmp);
1627         while (!list_empty(&plug->rbio_list)) {
1628                 cur = list_entry(plug->rbio_list.next,
1629                                  struct btrfs_raid_bio, plug_list);
1630                 list_del_init(&cur->plug_list);
1631
1632                 if (rbio_is_full(cur)) {
1633                         /* we have a full stripe, send it down */
1634                         full_stripe_write(cur);
1635                         continue;
1636                 }
1637                 if (last) {
1638                         if (rbio_can_merge(last, cur)) {
1639                                 merge_rbio(last, cur);
1640                                 __free_raid_bio(cur);
1641                                 continue;
1642
1643                         }
1644                         __raid56_parity_write(last);
1645                 }
1646                 last = cur;
1647         }
1648         if (last) {
1649                 __raid56_parity_write(last);
1650         }
1651         kfree(plug);
1652 }
1653
1654 /*
1655  * if the unplug comes from schedule, we have to push the
1656  * work off to a helper thread
1657  */
1658 static void unplug_work(struct btrfs_work *work)
1659 {
1660         struct btrfs_plug_cb *plug;
1661         plug = container_of(work, struct btrfs_plug_cb, work);
1662         run_plug(plug);
1663 }
1664
1665 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1666 {
1667         struct btrfs_plug_cb *plug;
1668         plug = container_of(cb, struct btrfs_plug_cb, cb);
1669
1670         if (from_schedule) {
1671                 plug->work.flags = 0;
1672                 plug->work.func = unplug_work;
1673                 btrfs_queue_worker(&plug->info->rmw_workers,
1674                                    &plug->work);
1675                 return;
1676         }
1677         run_plug(plug);
1678 }
1679
1680 /*
1681  * our main entry point for writes from the rest of the FS.
1682  */
1683 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1684                         struct btrfs_bio *bbio, u64 *raid_map,
1685                         u64 stripe_len)
1686 {
1687         struct btrfs_raid_bio *rbio;
1688         struct btrfs_plug_cb *plug = NULL;
1689         struct blk_plug_cb *cb;
1690
1691         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1692         if (IS_ERR(rbio))
1693                 return PTR_ERR(rbio);
1694         bio_list_add(&rbio->bio_list, bio);
1695         rbio->bio_list_bytes = bio->bi_size;
1696
1697         /*
1698          * don't plug on full rbios, just get them out the door
1699          * as quickly as we can
1700          */
1701         if (rbio_is_full(rbio))
1702                 return full_stripe_write(rbio);
1703
1704         cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1705                                sizeof(*plug));
1706         if (cb) {
1707                 plug = container_of(cb, struct btrfs_plug_cb, cb);
1708                 if (!plug->info) {
1709                         plug->info = root->fs_info;
1710                         INIT_LIST_HEAD(&plug->rbio_list);
1711                 }
1712                 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1713         } else {
1714                 return __raid56_parity_write(rbio);
1715         }
1716         return 0;
1717 }
1718
1719 /*
1720  * all parity reconstruction happens here.  We've read in everything
1721  * we can find from the drives and this does the heavy lifting of
1722  * sorting the good from the bad.
1723  */
1724 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1725 {
1726         int pagenr, stripe;
1727         void **pointers;
1728         int faila = -1, failb = -1;
1729         int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1730         struct page *page;
1731         int err;
1732         int i;
1733
1734         pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1735                            GFP_NOFS);
1736         if (!pointers) {
1737                 err = -ENOMEM;
1738                 goto cleanup_io;
1739         }
1740
1741         faila = rbio->faila;
1742         failb = rbio->failb;
1743
1744         if (rbio->read_rebuild) {
1745                 spin_lock_irq(&rbio->bio_list_lock);
1746                 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1747                 spin_unlock_irq(&rbio->bio_list_lock);
1748         }
1749
1750         index_rbio_pages(rbio);
1751
1752         for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1753                 /* setup our array of pointers with pages
1754                  * from each stripe
1755                  */
1756                 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1757                         /*
1758                          * if we're rebuilding a read, we have to use
1759                          * pages from the bio list
1760                          */
1761                         if (rbio->read_rebuild &&
1762                             (stripe == faila || stripe == failb)) {
1763                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
1764                         } else {
1765                                 page = rbio_stripe_page(rbio, stripe, pagenr);
1766                         }
1767                         pointers[stripe] = kmap(page);
1768                 }
1769
1770                 /* all raid6 handling here */
1771                 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1772                     RAID6_Q_STRIPE) {
1773
1774                         /*
1775                          * single failure, rebuild from parity raid5
1776                          * style
1777                          */
1778                         if (failb < 0) {
1779                                 if (faila == rbio->nr_data) {
1780                                         /*
1781                                          * Just the P stripe has failed, without
1782                                          * a bad data or Q stripe.
1783                                          * TODO, we should redo the xor here.
1784                                          */
1785                                         err = -EIO;
1786                                         goto cleanup;
1787                                 }
1788                                 /*
1789                                  * a single failure in raid6 is rebuilt
1790                                  * in the pstripe code below
1791                                  */
1792                                 goto pstripe;
1793                         }
1794
1795                         /* make sure our ps and qs are in order */
1796                         if (faila > failb) {
1797                                 int tmp = failb;
1798                                 failb = faila;
1799                                 faila = tmp;
1800                         }
1801
1802                         /* if the q stripe is failed, do a pstripe reconstruction
1803                          * from the xors.
1804                          * If both the q stripe and the P stripe are failed, we're
1805                          * here due to a crc mismatch and we can't give them the
1806                          * data they want
1807                          */
1808                         if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1809                                 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1810                                         err = -EIO;
1811                                         goto cleanup;
1812                                 }
1813                                 /*
1814                                  * otherwise we have one bad data stripe and
1815                                  * a good P stripe.  raid5!
1816                                  */
1817                                 goto pstripe;
1818                         }
1819
1820                         if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1821                                 raid6_datap_recov(rbio->bbio->num_stripes,
1822                                                   PAGE_SIZE, faila, pointers);
1823                         } else {
1824                                 raid6_2data_recov(rbio->bbio->num_stripes,
1825                                                   PAGE_SIZE, faila, failb,
1826                                                   pointers);
1827                         }
1828                 } else {
1829                         void *p;
1830
1831                         /* rebuild from P stripe here (raid5 or raid6) */
1832                         BUG_ON(failb != -1);
1833 pstripe:
1834                         /* Copy parity block into failed block to start with */
1835                         memcpy(pointers[faila],
1836                                pointers[rbio->nr_data],
1837                                PAGE_CACHE_SIZE);
1838
1839                         /* rearrange the pointer array */
1840                         p = pointers[faila];
1841                         for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1842                                 pointers[stripe] = pointers[stripe + 1];
1843                         pointers[rbio->nr_data - 1] = p;
1844
1845                         /* xor in the rest */
1846                         run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1847                 }
1848                 /* if we're doing this rebuild as part of an rmw, go through
1849                  * and set all of our private rbio pages in the
1850                  * failed stripes as uptodate.  This way finish_rmw will
1851                  * know they can be trusted.  If this was a read reconstruction,
1852                  * other endio functions will fiddle the uptodate bits
1853                  */
1854                 if (!rbio->read_rebuild) {
1855                         for (i = 0;  i < nr_pages; i++) {
1856                                 if (faila != -1) {
1857                                         page = rbio_stripe_page(rbio, faila, i);
1858                                         SetPageUptodate(page);
1859                                 }
1860                                 if (failb != -1) {
1861                                         page = rbio_stripe_page(rbio, failb, i);
1862                                         SetPageUptodate(page);
1863                                 }
1864                         }
1865                 }
1866                 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1867                         /*
1868                          * if we're rebuilding a read, we have to use
1869                          * pages from the bio list
1870                          */
1871                         if (rbio->read_rebuild &&
1872                             (stripe == faila || stripe == failb)) {
1873                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
1874                         } else {
1875                                 page = rbio_stripe_page(rbio, stripe, pagenr);
1876                         }
1877                         kunmap(page);
1878                 }
1879         }
1880
1881         err = 0;
1882 cleanup:
1883         kfree(pointers);
1884
1885 cleanup_io:
1886
1887         if (rbio->read_rebuild) {
1888                 if (err == 0)
1889                         cache_rbio_pages(rbio);
1890                 else
1891                         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1892
1893                 rbio_orig_end_io(rbio, err, err == 0);
1894         } else if (err == 0) {
1895                 rbio->faila = -1;
1896                 rbio->failb = -1;
1897                 finish_rmw(rbio);
1898         } else {
1899                 rbio_orig_end_io(rbio, err, 0);
1900         }
1901 }
1902
1903 /*
1904  * This is called only for stripes we've read from disk to
1905  * reconstruct the parity.
1906  */
1907 static void raid_recover_end_io(struct bio *bio, int err)
1908 {
1909         struct btrfs_raid_bio *rbio = bio->bi_private;
1910
1911         /*
1912          * we only read stripe pages off the disk, set them
1913          * up to date if there were no errors
1914          */
1915         if (err)
1916                 fail_bio_stripe(rbio, bio);
1917         else
1918                 set_bio_pages_uptodate(bio);
1919         bio_put(bio);
1920
1921         if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1922                 return;
1923
1924         if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1925                 rbio_orig_end_io(rbio, -EIO, 0);
1926         else
1927                 __raid_recover_end_io(rbio);
1928 }
1929
1930 /*
1931  * reads everything we need off the disk to reconstruct
1932  * the parity. endio handlers trigger final reconstruction
1933  * when the IO is done.
1934  *
1935  * This is used both for reads from the higher layers and for
1936  * parity construction required to finish a rmw cycle.
1937  */
1938 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1939 {
1940         int bios_to_read = 0;
1941         struct btrfs_bio *bbio = rbio->bbio;
1942         struct bio_list bio_list;
1943         int ret;
1944         int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1945         int pagenr;
1946         int stripe;
1947         struct bio *bio;
1948
1949         bio_list_init(&bio_list);
1950
1951         ret = alloc_rbio_pages(rbio);
1952         if (ret)
1953                 goto cleanup;
1954
1955         atomic_set(&rbio->bbio->error, 0);
1956
1957         /*
1958          * read everything that hasn't failed.  Thanks to the
1959          * stripe cache, it is possible that some or all of these
1960          * pages are going to be uptodate.
1961          */
1962         for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1963                 if (rbio->faila == stripe ||
1964                     rbio->failb == stripe)
1965                         continue;
1966
1967                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1968                         struct page *p;
1969
1970                         /*
1971                          * the rmw code may have already read this
1972                          * page in
1973                          */
1974                         p = rbio_stripe_page(rbio, stripe, pagenr);
1975                         if (PageUptodate(p))
1976                                 continue;
1977
1978                         ret = rbio_add_io_page(rbio, &bio_list,
1979                                        rbio_stripe_page(rbio, stripe, pagenr),
1980                                        stripe, pagenr, rbio->stripe_len);
1981                         if (ret < 0)
1982                                 goto cleanup;
1983                 }
1984         }
1985
1986         bios_to_read = bio_list_size(&bio_list);
1987         if (!bios_to_read) {
1988                 /*
1989                  * we might have no bios to read just because the pages
1990                  * were up to date, or we might have no bios to read because
1991                  * the devices were gone.
1992                  */
1993                 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1994                         __raid_recover_end_io(rbio);
1995                         goto out;
1996                 } else {
1997                         goto cleanup;
1998                 }
1999         }
2000
2001         /*
2002          * the bbio may be freed once we submit the last bio.  Make sure
2003          * not to touch it after that
2004          */
2005         atomic_set(&bbio->stripes_pending, bios_to_read);
2006         while (1) {
2007                 bio = bio_list_pop(&bio_list);
2008                 if (!bio)
2009                         break;
2010
2011                 bio->bi_private = rbio;
2012                 bio->bi_end_io = raid_recover_end_io;
2013
2014                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2015                                     BTRFS_WQ_ENDIO_RAID56);
2016
2017                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2018                 submit_bio(READ, bio);
2019         }
2020 out:
2021         return 0;
2022
2023 cleanup:
2024         if (rbio->read_rebuild)
2025                 rbio_orig_end_io(rbio, -EIO, 0);
2026         return -EIO;
2027 }
2028
2029 /*
2030  * the main entry point for reads from the higher layers.  This
2031  * is really only called when the normal read path had a failure,
2032  * so we assume the bio they send down corresponds to a failed part
2033  * of the drive.
2034  */
2035 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2036                           struct btrfs_bio *bbio, u64 *raid_map,
2037                           u64 stripe_len, int mirror_num)
2038 {
2039         struct btrfs_raid_bio *rbio;
2040         int ret;
2041
2042         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2043         if (IS_ERR(rbio))
2044                 return PTR_ERR(rbio);
2045
2046         rbio->read_rebuild = 1;
2047         bio_list_add(&rbio->bio_list, bio);
2048         rbio->bio_list_bytes = bio->bi_size;
2049
2050         rbio->faila = find_logical_bio_stripe(rbio, bio);
2051         if (rbio->faila == -1) {
2052                 BUG();
2053                 kfree(raid_map);
2054                 kfree(bbio);
2055                 kfree(rbio);
2056                 return -EIO;
2057         }
2058
2059         /*
2060          * reconstruct from the q stripe if they are
2061          * asking for mirror 3
2062          */
2063         if (mirror_num == 3)
2064                 rbio->failb = bbio->num_stripes - 2;
2065
2066         ret = lock_stripe_add(rbio);
2067
2068         /*
2069          * __raid56_parity_recover will end the bio with
2070          * any errors it hits.  We don't want to return
2071          * its error value up the stack because our caller
2072          * will end up calling bio_endio with any nonzero
2073          * return
2074          */
2075         if (ret == 0)
2076                 __raid56_parity_recover(rbio);
2077         /*
2078          * our rbio has been added to the list of
2079          * rbios that will be handled after the
2080          * currently lock owner is done
2081          */
2082         return 0;
2083
2084 }
2085
2086 static void rmw_work(struct btrfs_work *work)
2087 {
2088         struct btrfs_raid_bio *rbio;
2089
2090         rbio = container_of(work, struct btrfs_raid_bio, work);
2091         raid56_rmw_stripe(rbio);
2092 }
2093
2094 static void read_rebuild_work(struct btrfs_work *work)
2095 {
2096         struct btrfs_raid_bio *rbio;
2097
2098         rbio = container_of(work, struct btrfs_raid_bio, work);
2099         __raid56_parity_recover(rbio);
2100 }