drivers/md/dm-raid45.c
1 /*
2 * Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
4 * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
6 * This file is released under the GPL.
9 * Linux 2.6 Device Mapper RAID4 and RAID5 target.
11 * Tested-by: Intel; Marcin.Labun@intel.com, krzysztof.wojcik@intel.com
14 * Supports the following ATARAID vendor solutions (and SNIA DDF):
16 * Adaptec HostRAID ASR
17 * SNIA DDF1
18 * Highpoint 37x
19 * Highpoint 45x
20 * Intel IMSM
21 * Jmicron ATARAID
22 * LSI Logic MegaRAID
23 * NVidia RAID
24 * Promise FastTrack
25 * Silicon Image Medley
26 * VIA Software RAID
28 * via the dmraid application.
31 * Features:
33 * o RAID4 with dedicated and selectable parity device
34 * o RAID5 with rotating parity (left+right, symmetric+asymmetric)
35 * o recovery of out of sync device for initial
36 * RAID set creation or after dead drive replacement
37 * o run time optimization of xor algorithm used to calculate parity
40 * Thanks to MD for:
41 * o the raid address calculation algorithm
42 * o the base of the biovec <-> page list copier.
45 * Uses region hash to keep track of how many writes are in flight to
46 * regions in order to use dirty log to keep state of regions to recover:
48 * o clean regions (those which are synchronized
49 * and don't have write io in flight)
50 * o dirty regions (those with write io in flight)
53 * On startup, any dirty regions are migrated to the
54 * 'nosync' state and are subject to recovery by the daemon.
56 * See raid_ctr() for table definition.
58 * ANALYZEME: recovery bandwidth
61 static const char *version = "v0.2597k";
63 #include "dm.h"
64 #include "dm-memcache.h"
65 #include "dm-raid45.h"
67 #include <linux/kernel.h>
68 #include <linux/vmalloc.h>
69 #include <linux/raid/xor.h>
70 #include <linux/slab.h>
71 #include <linux/module.h>
73 #include <linux/bio.h>
74 #include <linux/dm-io.h>
75 #include <linux/dm-dirty-log.h>
76 #include <linux/dm-region-hash.h>
80 * Configurable parameters
83 /* Minimum/maximum and default # of selectable stripes. */
84 #define STRIPES_MIN 8
85 #define STRIPES_MAX 16384
86 #define STRIPES_DEFAULT 80
88 /* Maximum and default chunk size in sectors if not set in constructor. */
89 #define CHUNK_SIZE_MIN 8
90 #define CHUNK_SIZE_MAX 16384
91 #define CHUNK_SIZE_DEFAULT 64
93 /* Default io size in sectors if not set in constructor. */
94 #define IO_SIZE_MIN CHUNK_SIZE_MIN
95 #define IO_SIZE_DEFAULT IO_SIZE_MIN
97 /* Recover io size default in sectors. */
98 #define RECOVER_IO_SIZE_MIN 64
99 #define RECOVER_IO_SIZE_DEFAULT 256
101 /* Default, minimum and maximum percentage of recover io bandwidth. */
102 #define BANDWIDTH_DEFAULT 10
103 #define BANDWIDTH_MIN 1
104 #define BANDWIDTH_MAX 100
106 /* # of parallel recovered regions */
107 #define RECOVERY_STRIPES_MIN 1
108 #define RECOVERY_STRIPES_MAX 64
109 #define RECOVERY_STRIPES_DEFAULT RECOVERY_STRIPES_MIN
111 * END Configurable parameters
114 #define TARGET "dm-raid45"
115 #define DAEMON "kraid45d"
116 #define DM_MSG_PREFIX TARGET
118 #define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)
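/*
 * Note (added for clarity): with 4 KiB pages and 512-byte sectors
 * (SECTOR_SHIFT == 9) this evaluates to 4096 >> 9 == 8.
 */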
120 /* Amount/size for __xor(). */
121 #define XOR_SIZE PAGE_SIZE
123 /* Ticks to run xor_speed() test for. */
124 #define XOR_SPEED_TICKS 5
126 /* Check value in range. */
127 #define range_ok(i, min, max) (i >= min && i <= max)
129 /* Structure access macros. */
130 /* Derive raid_set from stripe_cache pointer. */
131 #define RS(x) container_of(x, struct raid_set, sc)
133 /* Page reference. */
134 #define PAGE(stripe, p) ((stripe)->obj[p].pl->page)
136 /* Stripe chunk reference. */
137 #define CHUNK(stripe, p) ((stripe)->chunk + p)
139 /* Bio list reference. */
140 #define BL(stripe, p, rw) (stripe->chunk[p].bl + rw)
141 #define BL_CHUNK(chunk, rw) (chunk->bl + rw)
143 /* Page list reference. */
144 #define PL(stripe, p) (stripe->obj[p].pl)
145 /* END: structure access macros. */
147 /* Factor out to dm-bio-list.h */
148 static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
150 bio->bi_next = bl->head;
151 bl->head = bio;
153 if (!bl->tail)
154 bl->tail = bio;
157 /* Factor out to dm.h */
158 #define TI_ERR_RET(str, ret) \
159 do { ti->error = str; return ret; } while (0);
160 #define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
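/*
 * Illustrative sketch (not part of the driver): range_ok() and the
 * TI_ERR*() macros are meant to be used together inside a target
 * constructor, where a "struct dm_target *ti" is in scope and the
 * function returns int.  A hypothetical parameter check (names are
 * illustrative only) could look like:
 *
 *	static int example_check_chunk_size(struct dm_target *ti,
 *					    unsigned chunk_size)
 *	{
 *		if (!range_ok(chunk_size, CHUNK_SIZE_MIN, CHUNK_SIZE_MAX))
 *			TI_ERR("Invalid chunk size");
 *
 *		return 0;
 *	}
 *
 * TI_ERR() sets ti->error to the message and returns -EINVAL from the
 * enclosing function.
 */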
162 /* Macro to define IO flags access inline functions. */
163 #define BITOPS(name, what, var, flag) \
164 static inline int TestClear ## name ## what(struct var *v) \
165 { return test_and_clear_bit(flag, &v->io.flags); } \
166 static inline int TestSet ## name ## what(struct var *v) \
167 { return test_and_set_bit(flag, &v->io.flags); } \
168 static inline void Clear ## name ## what(struct var *v) \
169 { clear_bit(flag, &v->io.flags); } \
170 static inline void Set ## name ## what(struct var *v) \
171 { set_bit(flag, &v->io.flags); } \
172 static inline int name ## what(struct var *v) \
173 { return test_bit(flag, &v->io.flags); }
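/*
 * For reference (expansion added for clarity): one invocation such as
 * BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY) below generates five
 * inline helpers operating on v->io.flags:
 *
 *	static inline int TestClearChunkDirty(struct stripe_chunk *v)
 *	{ return test_and_clear_bit(CHUNK_DIRTY, &v->io.flags); }
 *	static inline int TestSetChunkDirty(struct stripe_chunk *v)
 *	{ return test_and_set_bit(CHUNK_DIRTY, &v->io.flags); }
 *	static inline void ClearChunkDirty(struct stripe_chunk *v)
 *	{ clear_bit(CHUNK_DIRTY, &v->io.flags); }
 *	static inline void SetChunkDirty(struct stripe_chunk *v)
 *	{ set_bit(CHUNK_DIRTY, &v->io.flags); }
 *	static inline int ChunkDirty(struct stripe_chunk *v)
 *	{ return test_bit(CHUNK_DIRTY, &v->io.flags); }
 */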
175 /*-----------------------------------------------------------------
176 * Stripe cache
178 * Cache for all reads and writes to raid sets (operational or degraded)
180 * We need to run all data to and from a RAID set through this cache,
181 * because parity chunks need to get calculated from data chunks
182 * or, in the degraded/resynchronization case, missing chunks need
183 * to be reconstructed using the other chunks of the stripe.
184 *---------------------------------------------------------------*/
185 /* Unique kmem cache name suffix # counter. */
186 static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
188 /* A chunk within a stripe (holds bios hanging off). */
189 /* IO status flags for chunks of a stripe. */
190 enum chunk_flags {
191 CHUNK_DIRTY, /* Pages of chunk dirty; need writing. */
192 CHUNK_ERROR, /* IO error on any chunk page. */
193 CHUNK_IO, /* Allow/prohibit IO on chunk pages. */
194 CHUNK_LOCKED, /* Chunk pages locked during IO. */
195 CHUNK_MUST_IO, /* Chunk must io. */
196 CHUNK_UNLOCK, /* Enforce chunk unlock. */
197 CHUNK_UPTODATE, /* Chunk pages are uptodate. */
200 enum bl_type {
201 WRITE_QUEUED = WRITE + 1,
202 WRITE_MERGED,
203 NR_BL_TYPES, /* Must be last one! */
205 struct stripe_chunk {
206 atomic_t cnt; /* Reference count. */
207 struct stripe *stripe; /* Backpointer to stripe for endio(). */
208 /* Bio lists for reads, writes, and writes merged. */
209 struct bio_list bl[NR_BL_TYPES];
210 struct {
211 unsigned long flags; /* IO status flags. */
212 } io;
215 /* Define chunk bit operations. */
216 BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY)
217 BITOPS(Chunk, Error, stripe_chunk, CHUNK_ERROR)
218 BITOPS(Chunk, Io, stripe_chunk, CHUNK_IO)
219 BITOPS(Chunk, Locked, stripe_chunk, CHUNK_LOCKED)
220 BITOPS(Chunk, MustIo, stripe_chunk, CHUNK_MUST_IO)
221 BITOPS(Chunk, Unlock, stripe_chunk, CHUNK_UNLOCK)
222 BITOPS(Chunk, Uptodate, stripe_chunk, CHUNK_UPTODATE)
225 * Stripe linked list indexes. Keep order, because the stripe
226 * and the stripe cache rely on the first 3!
228 enum list_types {
229 LIST_FLUSH, /* Stripes to flush for io. */
230 LIST_ENDIO, /* Stripes to endio. */
231 LIST_LRU, /* Least recently used stripes. */
232 SC_NR_LISTS, /* # of lists in stripe cache. */
233 LIST_HASH = SC_NR_LISTS, /* Hashed stripes. */
234 LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
235 STRIPE_NR_LISTS,/* To size array in struct stripe. */
238 /* Addressing region recovery. */
239 struct recover_addr {
240 struct dm_region *reg; /* Actual region to recover. */
241 sector_t pos; /* Position within region to recover. */
242 sector_t end; /* End of region to recover. */
245 /* A stripe: the io object to handle all reads and writes to a RAID set. */
246 struct stripe {
247 atomic_t cnt; /* Reference count. */
248 struct stripe_cache *sc; /* Backpointer to stripe cache. */
251 * 4 linked lists:
252 * o io list to flush io
253 * o endio list
254 * o LRU list to put stripes w/o reference count on
255 * o stripe cache hash
257 struct list_head lists[STRIPE_NR_LISTS];
259 sector_t key; /* Hash key. */
260 region_t region; /* Region stripe is mapped to. */
262 struct {
263 unsigned long flags; /* Stripe state flags (see below). */
266 * Pending ios in flight:
268 * used to control move of stripe to endio list
270 atomic_t pending;
272 /* Sectors to read and write for multi page stripe sets. */
273 unsigned size;
274 } io;
276 /* Address region recovery. */
277 struct recover_addr *recover;
279 /* Lock on stripe (Future: for clustering). */
280 void *lock;
282 struct {
283 unsigned short parity; /* Parity chunk index. */
284 short recover; /* Recovery chunk index. */
285 } idx;
288 * This stripe's memory cache object (dm-mem-cache);
289 * i.e. the io chunk pages.
291 struct dm_mem_cache_object *obj;
293 /* Array of stripe sets (dynamically allocated). */
294 struct stripe_chunk chunk[0];
297 /* States stripes can be in (flags field). */
298 enum stripe_states {
299 STRIPE_ERROR, /* io error on stripe. */
300 STRIPE_MERGED, /* Writes got merged to be written. */
301 STRIPE_RBW, /* Read-before-write stripe. */
302 STRIPE_RECONSTRUCT, /* Reconstruct of a missing chunk required. */
303 STRIPE_RECONSTRUCTED, /* Missing chunk reconstructed. */
304 STRIPE_RECOVER, /* Stripe used for RAID set recovery. */
307 /* Define stripe bit operations. */
308 BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
309 BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
310 BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
311 BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
312 BITOPS(Stripe, Reconstructed, stripe, STRIPE_RECONSTRUCTED)
313 BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)
315 /* A stripe hash. */
316 struct stripe_hash {
317 struct list_head *hash;
318 unsigned buckets;
319 unsigned mask;
320 unsigned prime;
321 unsigned shift;
324 enum sc_lock_types {
325 LOCK_ENDIO, /* Protect endio list. */
326 NR_LOCKS, /* To size array in struct stripe_cache. */
329 /* A stripe cache. */
330 struct stripe_cache {
331 /* Stripe hash. */
332 struct stripe_hash hash;
334 spinlock_t locks[NR_LOCKS]; /* Locks to protect lists. */
336 /* Stripes with io to flush, stripes to endio and LRU lists. */
337 struct list_head lists[SC_NR_LISTS];
339 /* Slab cache to allocate stripes from. */
340 struct {
341 struct kmem_cache *cache; /* Cache itself. */
342 char name[32]; /* Unique name. */
343 } kc;
345 struct dm_io_client *dm_io_client; /* dm-io client resource context. */
347 /* dm-mem-cache client resource context. */
348 struct dm_mem_cache_client *mem_cache_client;
350 int stripes_parm; /* # stripes parameter from constructor. */
351 atomic_t stripes; /* actual # of stripes in cache. */
352 atomic_t stripes_to_set; /* # of stripes to resize cache to. */
353 atomic_t stripes_last; /* last # of stripes in cache. */
354 atomic_t active_stripes; /* actual # of active stripes in cache. */
356 /* REMOVEME: */
357 atomic_t active_stripes_max; /* actual # of active stripes in cache. */
360 /* Flag specs for raid_dev. */
361 enum raid_dev_flags {
362 DEV_FAILED, /* Device failed. */
363 DEV_IO_QUEUED, /* Io got queued to device. */
366 /* The raid device in a set. */
367 struct raid_dev {
368 struct dm_dev *dev;
369 sector_t start; /* Offset to map to. */
370 struct { /* Using struct to be able to BITOPS(). */
371 unsigned long flags; /* raid_dev_flags. */
372 } io;
375 BITOPS(Dev, Failed, raid_dev, DEV_FAILED)
376 BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED)
378 /* Flags spec for raid_set. */
379 enum raid_set_flags {
380 RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */
381 RS_DEAD, /* RAID set inoperative. */
382 RS_DEAD_ENDIO_MESSAGE, /* RAID set dead endio one-off message. */
383 RS_DEGRADED, /* Io errors on RAID device. */
384 RS_DEVEL_STATS, /* REMOVEME: display status information. */
385 RS_ENFORCE_PARITY_CREATION,/* Enforce parity creation. */
386 RS_PROHIBIT_WRITES, /* Prohibit writes on device failure. */
387 RS_RECOVER, /* Do recovery. */
388 RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */
389 RS_SC_BUSY, /* Stripe cache busy -> send an event. */
390 RS_SUSPEND, /* Suspend RAID set. */
393 /* REMOVEME: devel stats counters. */
394 enum stats_types {
395 S_BIOS_READ,
396 S_BIOS_ADDED_READ,
397 S_BIOS_ENDIO_READ,
398 S_BIOS_WRITE,
399 S_BIOS_ADDED_WRITE,
400 S_BIOS_ENDIO_WRITE,
401 S_CAN_MERGE,
402 S_CANT_MERGE,
403 S_CONGESTED,
404 S_DM_IO_READ,
405 S_DM_IO_WRITE,
406 S_BANDWIDTH,
407 S_BARRIER,
408 S_BIO_COPY_PL_NEXT,
409 S_DEGRADED,
410 S_DELAYED_BIOS,
411 S_FLUSHS,
412 S_HITS_1ST,
413 S_IOS_POST,
414 S_INSCACHE,
415 S_MAX_LOOKUP,
416 S_CHUNK_LOCKED,
417 S_NO_BANDWIDTH,
418 S_NOT_CONGESTED,
419 S_NO_RW,
420 S_NOSYNC,
421 S_OVERWRITE,
422 S_PROHIBITCHUNKIO,
423 S_RECONSTRUCT_EI,
424 S_RECONSTRUCT_DEV,
425 S_RECONSTRUCT_SET,
426 S_RECONSTRUCTED,
427 S_REQUEUE,
428 S_STRIPE_ERROR,
429 S_SUM_DELAYED_BIOS,
430 S_XORS,
431 S_NR_STATS, /* # of stats counters. Must be last! */
434 /* Status type -> string mappings. */
435 struct stats_map {
436 const enum stats_types type;
437 const char *str;
440 static struct stats_map stats_map[] = {
441 { S_BIOS_READ, "r=" },
442 { S_BIOS_ADDED_READ, "/" },
443 { S_BIOS_ENDIO_READ, "/" },
444 { S_BIOS_WRITE, " w=" },
445 { S_BIOS_ADDED_WRITE, "/" },
446 { S_BIOS_ENDIO_WRITE, "/" },
447 { S_DM_IO_READ, " rc=" },
448 { S_DM_IO_WRITE, " wc=" },
449 { S_BANDWIDTH, "\nbw=" },
450 { S_NO_BANDWIDTH, " no_bw=" },
451 { S_BARRIER, "\nbarrier=" },
452 { S_BIO_COPY_PL_NEXT, "\nbio_cp_next=" },
453 { S_CAN_MERGE, "\nmerge=" },
454 { S_CANT_MERGE, "/no_merge=" },
455 { S_CHUNK_LOCKED, "\nchunk_locked=" },
456 { S_CONGESTED, "\ncgst=" },
457 { S_NOT_CONGESTED, "/not_cgst=" },
458 { S_DEGRADED, "\ndegraded=" },
459 { S_DELAYED_BIOS, "\ndel_bios=" },
460 { S_SUM_DELAYED_BIOS, "/sum_del_bios=" },
461 { S_FLUSHS, "\nflushs=" },
462 { S_HITS_1ST, "\nhits_1st=" },
463 { S_IOS_POST, " ios_post=" },
464 { S_INSCACHE, " inscache=" },
465 { S_MAX_LOOKUP, " maxlookup=" },
466 { S_NO_RW, "\nno_rw=" },
467 { S_NOSYNC, " nosync=" },
468 { S_OVERWRITE, " ovr=" },
469 { S_PROHIBITCHUNKIO, " prhbt_io=" },
470 { S_RECONSTRUCT_EI, "\nrec_ei=" },
471 { S_RECONSTRUCT_DEV, " rec_dev=" },
472 { S_RECONSTRUCT_SET, " rec_set=" },
473 { S_RECONSTRUCTED, " rec=" },
474 { S_REQUEUE, " requeue=" },
475 { S_STRIPE_ERROR, " stripe_err=" },
476 { S_XORS, " xors=" },
480 * A RAID set.
482 #define dm_rh_client dm_region_hash
483 enum count_type { IO_WORK = 0, IO_RECOVER, IO_NR_COUNT };
484 typedef void (*xor_function_t)(unsigned count, unsigned long **data);
485 struct raid_set {
486 struct dm_target *ti; /* Target pointer. */
488 struct {
489 unsigned long flags; /* State flags. */
490 struct mutex in_lock; /* Protects central input list below. */
491 struct mutex xor_lock; /* Protects xor algorithm set. */
492 struct bio_list in; /* Pending ios (central input list). */
493 struct bio_list work; /* ios work set. */
494 wait_queue_head_t suspendq; /* suspend synchronization. */
495 atomic_t in_process; /* counter of queued bios (suspendq). */
496 atomic_t in_process_max;/* counter of queued bios max. */
498 /* io work. */
499 struct workqueue_struct *wq;
500 struct delayed_work dws_do_raid; /* For main worker. */
501 struct work_struct ws_do_table_event; /* For event worker. */
502 } io;
504 /* Stripe locking abstraction. */
505 struct dm_raid45_locking_type *locking;
507 struct stripe_cache sc; /* Stripe cache for this set. */
509 /* Xor optimization. */
510 struct {
511 struct xor_func *f;
512 unsigned chunks;
513 unsigned speed;
514 } xor;
516 /* Recovery parameters. */
517 struct recover {
518 struct dm_dirty_log *dl; /* Dirty log. */
519 struct dm_rh_client *rh; /* Region hash. */
521 struct dm_io_client *dm_io_client; /* recovery dm-io client. */
522 /* dm-mem-cache client resource context for recovery stripes. */
523 struct dm_mem_cache_client *mem_cache_client;
525 struct list_head stripes; /* List of recovery stripes. */
527 region_t nr_regions;
528 region_t nr_regions_to_recover;
529 region_t nr_regions_recovered;
530 unsigned long start_jiffies;
531 unsigned long end_jiffies;
533 unsigned bandwidth; /* Recovery bandwidth [%]. */
534 unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
535 unsigned bandwidth_parm; /* " constructor parm. */
536 unsigned io_size; /* recovery io size <= region size. */
537 unsigned io_size_parm; /* recovery io size ctr parameter. */
538 unsigned recovery; /* Recovery allowed/prohibited. */
539 unsigned recovery_stripes; /* # of parallel recovery stripes. */
541 /* recovery io throttling. */
542 atomic_t io_count[IO_NR_COUNT]; /* counter recover/regular io.*/
543 unsigned long last_jiffies;
544 } recover;
546 /* RAID set parameters. */
547 struct {
548 struct raid_type *raid_type; /* RAID type (eg, RAID4). */
549 unsigned raid_parms; /* # variable raid parameters. */
551 unsigned chunk_size; /* Sectors per chunk. */
552 unsigned chunk_size_parm;
553 unsigned chunk_shift; /* rsector chunk size shift. */
555 unsigned io_size; /* Sectors per io. */
556 unsigned io_size_parm;
557 unsigned io_mask; /* Mask for bio_copy_page_list(). */
558 unsigned io_inv_mask; /* Mask for raid_address(). */
560 sector_t sectors_per_dev; /* Sectors per device. */
562 atomic_t failed_devs; /* Amount of devices failed. */
564 /* Index of device to initialize. */
565 int dev_to_init;
566 int dev_to_init_parm;
568 /* Raid devices dynamically allocated. */
569 unsigned raid_devs; /* # of RAID devices below. */
570 unsigned data_devs; /* # of RAID data devices. */
572 int ei; /* index of failed RAID device. */
574 /* Index of dedicated parity device (i.e. RAID4). */
575 int pi;
576 int pi_parm; /* constructor parm for status output. */
577 } set;
579 /* REMOVEME: devel stats counters. */
580 atomic_t stats[S_NR_STATS];
582 /* Dynamically allocated temporary pointers for xor(). */
583 unsigned long **data;
585 /* Dynamically allocated RAID devices. Alignment? */
586 struct raid_dev dev[0];
589 /* Define RAID set bit operations. */
590 BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
591 BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
592 BITOPS(RS, Dead, raid_set, RS_DEAD)
593 BITOPS(RS, DeadEndioMessage, raid_set, RS_DEAD_ENDIO_MESSAGE)
594 BITOPS(RS, Degraded, raid_set, RS_DEGRADED)
595 BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
596 BITOPS(RS, EnforceParityCreation, raid_set, RS_ENFORCE_PARITY_CREATION)
597 BITOPS(RS, ProhibitWrites, raid_set, RS_PROHIBIT_WRITES)
598 BITOPS(RS, Recover, raid_set, RS_RECOVER)
599 BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
600 BITOPS(RS, Suspend, raid_set, RS_SUSPEND)
601 #undef BITOPS
603 /*-----------------------------------------------------------------
604 * Raid-4/5 set structures.
605 *---------------------------------------------------------------*/
606 /* RAID level definitions. */
607 enum raid_level {
608 raid4,
609 raid5,
612 /* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
613 enum raid_algorithm {
614 none,
615 left_asym,
616 right_asym,
617 left_sym,
618 right_sym,
621 struct raid_type {
622 const char *name; /* RAID algorithm. */
623 const char *descr; /* Descriptor text for logging. */
624 const unsigned parity_devs; /* # of parity devices. */
625 const unsigned minimal_devs; /* minimal # of devices in set. */
626 const enum raid_level level; /* RAID level. */
627 const enum raid_algorithm algorithm; /* RAID algorithm. */
630 /* Supported raid types and properties. */
631 static struct raid_type raid_types[] = {
632 {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
633 {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym},
634 {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym},
635 {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym},
636 {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym},
639 /* Address as calculated by raid_address(). */
640 struct raid_address {
641 sector_t key; /* Hash key (address of stripe % chunk_size). */
642 unsigned di, pi; /* Data and parity disks index. */
645 /* REMOVEME: reset statistics counters. */
646 static void stats_reset(struct raid_set *rs)
648 unsigned s = S_NR_STATS;
650 while (s--)
651 atomic_set(rs->stats + s, 0);
654 /*----------------------------------------------------------------
655 * RAID set management routines.
656 *--------------------------------------------------------------*/
658 * Begin small helper functions.
660 /* No need to be called from region hash indirectly at dm_rh_dec(). */
661 static void wake_dummy(void *context) {}
663 /* Return # of io reference. */
664 static int io_ref(struct raid_set *rs)
666 return atomic_read(&rs->io.in_process);
669 /* Get an io reference. */
670 static void io_get(struct raid_set *rs)
672 int p = atomic_inc_return(&rs->io.in_process);
674 if (p > atomic_read(&rs->io.in_process_max))
675 atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
678 /* Put the io reference and conditionally wake io waiters. */
679 static void io_put(struct raid_set *rs)
681 /* Intel: rebuild data corrupter? */
682 if (atomic_dec_and_test(&rs->io.in_process))
683 wake_up(&rs->io.suspendq);
684 else
685 BUG_ON(io_ref(rs) < 0);
688 /* Wait until all io has been processed. */
689 static void wait_ios(struct raid_set *rs)
691 wait_event(rs->io.suspendq, !io_ref(rs));
694 /* Queue (optionally delayed) io work. */
695 static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
697 queue_delayed_work(rs->io.wq, &rs->io.dws_do_raid, delay);
700 /* Queue io work immediately (called from region hash too). */
701 static void wake_do_raid(void *context)
703 struct raid_set *rs = context;
705 queue_work(rs->io.wq, &rs->io.dws_do_raid.work);
708 /* Calculate device sector offset. */
709 static sector_t _sector(struct raid_set *rs, struct bio *bio)
711 sector_t sector = bio->bi_sector;
713 sector_div(sector, rs->set.data_devs);
714 return sector;
717 /* Return # of active stripes in stripe cache. */
718 static int sc_active(struct stripe_cache *sc)
720 return atomic_read(&sc->active_stripes);
723 /* Stripe cache busy indicator. */
724 static int sc_busy(struct raid_set *rs)
726 return sc_active(&rs->sc) >
727 atomic_read(&rs->sc.stripes) - (STRIPES_MIN / 2);
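/*
 * Example (added for clarity): with the default STRIPES_DEFAULT (80)
 * stripes in the cache, sc_busy() reports busy once more than
 * 80 - STRIPES_MIN / 2 = 76 stripes are active, i.e. as soon as fewer
 * than STRIPES_MIN / 2 inactive stripes remain.
 */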
730 /* Set chunks states. */
731 enum chunk_dirty_type { CLEAN, DIRTY, ERROR };
732 static void chunk_set(struct stripe_chunk *chunk, enum chunk_dirty_type type)
734 switch (type) {
735 case CLEAN:
736 ClearChunkDirty(chunk);
737 break;
738 case DIRTY:
739 SetChunkDirty(chunk);
740 break;
741 case ERROR:
742 SetChunkError(chunk);
743 SetStripeError(chunk->stripe);
744 return;
745 default:
746 BUG();
749 SetChunkUptodate(chunk);
750 SetChunkIo(chunk);
751 ClearChunkError(chunk);
754 /* Return region state for a sector. */
755 static int region_state(struct raid_set *rs, sector_t sector,
756 enum dm_rh_region_states state)
758 struct dm_rh_client *rh = rs->recover.rh;
759 region_t region = dm_rh_sector_to_region(rh, sector);
761 return !!(dm_rh_get_state(rh, region, 1) & state);
765 * Return true in case a chunk should be read/written
767 * Conditions to read/write:
768 * o chunk not uptodate
769 * o chunk dirty
771 * Conditions to avoid io:
772 * o io already ongoing on chunk
773 * o io explicitly prohibited
775 static int chunk_io(struct stripe_chunk *chunk)
777 /* 2nd run optimization (flag set below on first run). */
778 if (TestClearChunkMustIo(chunk))
779 return 1;
781 /* Avoid io if prohibited or a locked chunk. */
782 if (!ChunkIo(chunk) || ChunkLocked(chunk))
783 return 0;
785 if (!ChunkUptodate(chunk) || ChunkDirty(chunk)) {
786 SetChunkMustIo(chunk); /* 2nd run optimization. */
787 return 1;
790 return 0;
793 /* Call a function on each chunk needing io unless device failed. */
794 static unsigned for_each_io_dev(struct stripe *stripe,
795 void (*f_io)(struct stripe *stripe, unsigned p))
797 struct raid_set *rs = RS(stripe->sc);
798 unsigned p, r = 0;
800 for (p = 0; p < rs->set.raid_devs; p++) {
801 if (chunk_io(CHUNK(stripe, p)) && !DevFailed(rs->dev + p)) {
802 f_io(stripe, p);
803 r++;
807 return r;
811 * Index of device to calculate parity on.
813 * Either the parity device index *or* the selected
814 * device to init after a spare replacement.
816 static int dev_for_parity(struct stripe *stripe, int *sync)
818 struct raid_set *rs = RS(stripe->sc);
819 int r = region_state(rs, stripe->key, DM_RH_NOSYNC | DM_RH_RECOVERING);
821 *sync = !r;
823 /* Reconstruct a particular device? */
824 if (r && rs->set.dev_to_init > -1)
825 return rs->set.dev_to_init;
826 else if (rs->set.raid_type->level == raid4)
827 return rs->set.pi;
828 else if (!StripeRecover(stripe))
829 return stripe->idx.parity;
830 else
831 return -1;
834 /* RAID set congested function. */
835 static int rs_congested(void *congested_data, int bdi_bits)
837 int r;
838 unsigned p;
839 struct raid_set *rs = congested_data;
841 if (sc_busy(rs) || RSSuspend(rs) || RSProhibitWrites(rs))
842 r = 1;
843 else for (r = 0, p = rs->set.raid_devs; !r && p--; ) {
844 /* If any of our component devices are overloaded. */
845 struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
847 r |= bdi_congested(&q->backing_dev_info, bdi_bits);
850 /* REMOVEME: statistics. */
851 atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
852 return r;
855 /* RAID device degrade check. */
856 static void rs_check_degrade_dev(struct raid_set *rs,
857 struct stripe *stripe, unsigned p)
859 if (TestSetDevFailed(rs->dev + p))
860 return;
862 /* Throw an event in case of member device errors. */
863 if ((atomic_inc_return(&rs->set.failed_devs) >
864 rs->set.raid_type->parity_devs) &&
865 !TestSetRSDead(rs)) {
866 /* Display RAID set dead message once. */
867 unsigned p;
868 char buf[BDEVNAME_SIZE];
870 DMERR("FATAL: too many devices failed -> RAID set broken");
871 for (p = 0; p < rs->set.raid_devs; p++) {
872 if (DevFailed(rs->dev + p))
873 DMERR("device /dev/%s failed",
874 bdevname(rs->dev[p].dev->bdev, buf));
878 /* Only log the first member error. */
879 if (!TestSetRSDegraded(rs)) {
880 char buf[BDEVNAME_SIZE];
882 /* Store index for recovery. */
883 rs->set.ei = p;
884 DMERR("CRITICAL: %sio error on device /dev/%s "
885 "in region=%llu; DEGRADING RAID set\n",
886 stripe ? "" : "FAKED ",
887 bdevname(rs->dev[p].dev->bdev, buf),
888 (unsigned long long) (stripe ? stripe->key : 0));
889 DMERR("further device error messages suppressed");
892 /* Prohibit further writes to allow userspace to update metadata. */
893 SetRSProhibitWrites(rs);
894 schedule_work(&rs->io.ws_do_table_event);
897 /* RAID set degrade check. */
898 static void rs_check_degrade(struct stripe *stripe)
900 struct raid_set *rs = RS(stripe->sc);
901 unsigned p = rs->set.raid_devs;
903 while (p--) {
904 if (ChunkError(CHUNK(stripe, p)))
905 rs_check_degrade_dev(rs, stripe, p);
909 /* Lookup a RAID device by name or by major:minor number. */
910 static int raid_dev_lookup(struct raid_set *rs, struct raid_dev *dev_lookup)
912 unsigned p;
913 struct raid_dev *dev;
916 * Must be an incremental loop, because the device array
917 * can have empty slots still on calls from raid_ctr()
919 for (dev = rs->dev, p = 0;
920 dev->dev && p < rs->set.raid_devs;
921 dev++, p++) {
922 if (dev_lookup->dev->bdev->bd_dev == dev->dev->bdev->bd_dev)
923 return p;
926 return -ENODEV;
929 * End small helper functions.
933 * Stripe hash functions
935 /* Initialize/destroy stripe hash. */
936 static int hash_init(struct stripe_hash *hash, unsigned stripes)
938 unsigned buckets = roundup_pow_of_two(stripes >> 1);
939 static unsigned hash_primes[] = {
940 /* Table of primes for hash_fn/table size optimization. */
941 1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769,
942 1543, 3079, 6151, 12289, 24593, 49157, 98317,
945 /* Allocate stripe hash buckets. */
946 hash->hash = vmalloc(buckets * sizeof(*hash->hash));
947 if (!hash->hash)
948 return -ENOMEM;
950 hash->buckets = buckets;
951 hash->mask = buckets - 1;
952 hash->shift = ffs(buckets);
953 if (hash->shift > ARRAY_SIZE(hash_primes))
954 hash->shift = ARRAY_SIZE(hash_primes) - 1;
956 BUG_ON(hash->shift < 2);
957 hash->prime = hash_primes[hash->shift];
959 /* Initialize buckets. */
960 while (buckets--)
961 INIT_LIST_HEAD(hash->hash + buckets);
962 return 0;
965 static void hash_exit(struct stripe_hash *hash)
967 if (hash->hash) {
968 vfree(hash->hash);
969 hash->hash = NULL;
973 static unsigned hash_fn(struct stripe_hash *hash, sector_t key)
975 return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
978 static struct list_head *hash_bucket(struct stripe_hash *hash, sector_t key)
980 return hash->hash + hash_fn(hash, key);
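/*
 * Worked example (added for clarity; stripe count assumed): for a cache
 * of 1024 stripes, hash_init() allocates roundup_pow_of_two(1024 >> 1)
 * = 512 buckets, so mask = 511, shift = ffs(512) = 10 and prime =
 * hash_primes[10] = 769.  A stripe key then maps to bucket
 * ((key * 769) >> 10) & 511.
 */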
983 /* Insert an entry into a hash. */
984 static void stripe_insert(struct stripe_hash *hash, struct stripe *stripe)
986 list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
989 /* Lookup an entry in the stripe hash. */
990 static struct stripe *stripe_lookup(struct stripe_cache *sc, sector_t key)
992 unsigned look = 0;
993 struct stripe *stripe;
994 struct list_head *bucket = hash_bucket(&sc->hash, key);
996 list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
997 look++;
999 if (stripe->key == key) {
1000 /* REMOVEME: statistics. */
1001 if (look > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
1002 atomic_set(RS(sc)->stats + S_MAX_LOOKUP, look);
1003 return stripe;
1007 return NULL;
1010 /* Resize the stripe cache hash on size changes. */
1011 static int sc_hash_resize(struct stripe_cache *sc)
1013 /* Resize indicated? */
1014 if (atomic_read(&sc->stripes) != atomic_read(&sc->stripes_last)) {
1015 int r;
1016 struct stripe_hash hash;
1018 r = hash_init(&hash, atomic_read(&sc->stripes));
1019 if (r)
1020 return r;
1022 if (sc->hash.hash) {
1023 unsigned b = sc->hash.buckets;
1024 struct list_head *pos, *tmp;
1026 /* Walk old buckets and insert into new. */
1027 while (b--) {
1028 list_for_each_safe(pos, tmp, sc->hash.hash + b)
1029 stripe_insert(&hash,
1030 list_entry(pos, struct stripe,
1031 lists[LIST_HASH]));
1036 hash_exit(&sc->hash);
1037 memcpy(&sc->hash, &hash, sizeof(sc->hash));
1038 atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
1041 return 0;
1043 /* End stripe hash functions. */
1045 /* List add, delete, push and pop functions. */
1046 /* Add stripe to flush list. */
1047 #define DEL_LIST(lh) \
1048 if (!list_empty(lh)) \
1049 list_del_init(lh);
1051 /* Delete stripe from hash. */
1052 static void stripe_hash_del(struct stripe *stripe)
1054 DEL_LIST(stripe->lists + LIST_HASH);
1057 /* Return stripe reference count. */
1058 static inline int stripe_ref(struct stripe *stripe)
1060 return atomic_read(&stripe->cnt);
1063 static void stripe_flush_add(struct stripe *stripe)
1065 struct stripe_cache *sc = stripe->sc;
1066 struct list_head *lh = stripe->lists + LIST_FLUSH;
1068 if (!StripeReconstruct(stripe) && list_empty(lh))
1069 list_add_tail(lh, sc->lists + LIST_FLUSH);
1073 * Add stripe to LRU (inactive) list.
1075 * Need lock, because of concurrent access from message interface.
1077 static void stripe_lru_add(struct stripe *stripe)
1079 if (!StripeRecover(stripe)) {
1080 struct list_head *lh = stripe->lists + LIST_LRU;
1082 if (list_empty(lh))
1083 list_add_tail(lh, stripe->sc->lists + LIST_LRU);
1087 #define POP_LIST(list) \
1088 do { \
1089 if (list_empty(sc->lists + (list))) \
1090 stripe = NULL; \
1091 else { \
1092 stripe = list_first_entry(sc->lists + (list), \
1093 struct stripe, \
1094 lists[(list)]); \
1095 list_del_init(stripe->lists + (list)); \
1097 } while (0);
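/*
 * For reference (expansion added for clarity): POP_LIST() deliberately
 * relies on "sc" and "stripe" being defined in the calling scope, so
 * stripe_lru_pop() below effectively expands to:
 *
 *	if (list_empty(sc->lists + LIST_LRU))
 *		stripe = NULL;
 *	else {
 *		stripe = list_first_entry(sc->lists + LIST_LRU,
 *					  struct stripe, lists[LIST_LRU]);
 *		list_del_init(stripe->lists + LIST_LRU);
 *	}
 */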
1099 /* Pop an available stripe off the LRU list. */
1100 static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
1102 struct stripe *stripe;
1104 POP_LIST(LIST_LRU);
1105 return stripe;
1108 /* Pop an available stripe off the io list. */
1109 static struct stripe *stripe_io_pop(struct stripe_cache *sc)
1111 struct stripe *stripe;
1113 POP_LIST(LIST_FLUSH);
1114 return stripe;
1117 /* Push a stripe safely onto the endio list to be handled by do_endios(). */
1118 static void stripe_endio_push(struct stripe *stripe)
1120 unsigned long flags;
1121 struct stripe_cache *sc = stripe->sc;
1122 struct list_head *stripe_list = stripe->lists + LIST_ENDIO,
1123 *sc_list = sc->lists + LIST_ENDIO;
1124 spinlock_t *lock = sc->locks + LOCK_ENDIO;
1126 /* This runs in parallel with do_endios(). */
1127 spin_lock_irqsave(lock, flags);
1128 if (list_empty(stripe_list))
1129 list_add_tail(stripe_list, sc_list);
1130 spin_unlock_irqrestore(lock, flags);
1132 wake_do_raid(RS(sc)); /* Wake myself. */
1135 /* Safely pop a stripe off the endio list. */
1136 static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
1138 struct stripe *stripe;
1139 spinlock_t *lock = sc->locks + LOCK_ENDIO;
1141 /* This runs in parallel with endio(). */
1142 spin_lock_irq(lock);
1143 POP_LIST(LIST_ENDIO)
1144 spin_unlock_irq(lock);
1145 return stripe;
1147 #undef POP_LIST
1150 * Stripe cache locking functions
1152 /* Dummy lock function for single host RAID4+5. */
1153 static void *no_lock(sector_t key, enum dm_lock_type type)
1155 return &no_lock;
1158 /* Dummy unlock function for single host RAID4+5. */
1159 static void no_unlock(void *lock_handle)
1163 /* No locking (for single host RAID 4+5). */
1164 static struct dm_raid45_locking_type locking_none = {
1165 .lock = no_lock,
1166 .unlock = no_unlock,
1169 /* Lock a stripe (for clustering). */
1170 static int
1171 stripe_lock(struct stripe *stripe, int rw, sector_t key)
1173 stripe->lock = RS(stripe->sc)->locking->lock(key, rw == READ ? DM_RAID45_SHARED : DM_RAID45_EX);
1174 return stripe->lock ? 0 : -EPERM;
1177 /* Unlock a stripe (for clustering). */
1178 static void stripe_unlock(struct stripe *stripe)
1180 RS(stripe->sc)->locking->unlock(stripe->lock);
1181 stripe->lock = NULL;
1184 /* Test io pending on stripe. */
1185 static int stripe_io_ref(struct stripe *stripe)
1187 return atomic_read(&stripe->io.pending);
1190 static void stripe_io_get(struct stripe *stripe)
1192 if (atomic_inc_return(&stripe->io.pending) == 1)
1193 /* REMOVEME: statistics */
1194 atomic_inc(&stripe->sc->active_stripes);
1195 else
1196 BUG_ON(stripe_io_ref(stripe) < 0);
1199 static void stripe_io_put(struct stripe *stripe)
1201 if (atomic_dec_and_test(&stripe->io.pending)) {
1202 if (unlikely(StripeRecover(stripe)))
1203 /* Don't put recovery stripe on endio list. */
1204 wake_do_raid(RS(stripe->sc));
1205 else
1206 /* Add regular stripe to endio list and wake daemon. */
1207 stripe_endio_push(stripe);
1209 /* REMOVEME: statistics */
1210 atomic_dec(&stripe->sc->active_stripes);
1211 } else
1212 BUG_ON(stripe_io_ref(stripe) < 0);
1215 /* Take stripe reference out. */
1216 static int stripe_get(struct stripe *stripe)
1218 int r;
1219 struct list_head *lh = stripe->lists + LIST_LRU;
1221 /* Delete stripe from LRU (inactive) list if on. */
1222 DEL_LIST(lh);
1223 BUG_ON(stripe_ref(stripe) < 0);
1225 /* Lock stripe on first reference */
1226 r = (atomic_inc_return(&stripe->cnt) == 1) ?
1227 stripe_lock(stripe, WRITE, stripe->key) : 0;
1229 return r;
1231 #undef DEL_LIST
1233 /* Return references on a chunk. */
1234 static int chunk_ref(struct stripe_chunk *chunk)
1236 return atomic_read(&chunk->cnt);
1239 /* Take out reference on a chunk. */
1240 static int chunk_get(struct stripe_chunk *chunk)
1242 return atomic_inc_return(&chunk->cnt);
1245 /* Drop reference on a chunk. */
1246 static void chunk_put(struct stripe_chunk *chunk)
1248 BUG_ON(atomic_dec_return(&chunk->cnt) < 0);
1252 * Drop reference on a stripe.
1254 * Move it to list of LRU stripes if zero.
1256 static void stripe_put(struct stripe *stripe)
1258 if (atomic_dec_and_test(&stripe->cnt)) {
1259 BUG_ON(stripe_io_ref(stripe));
1260 stripe_unlock(stripe);
1261 } else
1262 BUG_ON(stripe_ref(stripe) < 0);
1265 /* Helper needed by for_each_io_dev(). */
1266 static void stripe_get_references(struct stripe *stripe, unsigned p)
1270 * Another one to reference the stripe in
1271 * order to protect vs. LRU list moves.
1273 io_get(RS(stripe->sc)); /* Global io references. */
1274 stripe_get(stripe);
1275 stripe_io_get(stripe); /* One for each chunk io. */
1278 /* Helper for endio() to put all taken references. */
1279 static void stripe_put_references(struct stripe *stripe)
1281 stripe_io_put(stripe); /* One for each chunk io. */
1282 stripe_put(stripe);
1283 io_put(RS(stripe->sc));
1287 * Stripe cache functions.
1290 * Invalidate all chunks (i.e. their pages) of a stripe.
1292 * I only keep state for the whole chunk.
1294 static inline void stripe_chunk_invalidate(struct stripe_chunk *chunk)
1296 chunk->io.flags = 0;
1299 static void
1300 stripe_chunks_invalidate(struct stripe *stripe)
1302 unsigned p = RS(stripe->sc)->set.raid_devs;
1304 while (p--)
1305 stripe_chunk_invalidate(CHUNK(stripe, p));
1308 /* Prepare stripe for (re)use. */
1309 static void stripe_invalidate(struct stripe *stripe)
1311 stripe->io.flags = 0;
1312 stripe->idx.parity = stripe->idx.recover = -1;
1313 stripe_chunks_invalidate(stripe);
1317 * Allow io on all chunks of a stripe.
1318 * If not set, IO will not occur; i.e. it's prohibited.
1320 * Actual IO submission for allowed chunks depends
1321 * on their !uptodate or dirty state.
1323 static void stripe_allow_io(struct stripe *stripe)
1325 unsigned p = RS(stripe->sc)->set.raid_devs;
1327 while (p--)
1328 SetChunkIo(CHUNK(stripe, p));
1331 /* Initialize a stripe. */
1332 static void stripe_init(struct stripe_cache *sc, struct stripe *stripe)
1334 unsigned i, p = RS(sc)->set.raid_devs;
1336 /* Work all io chunks. */
1337 while (p--) {
1338 struct stripe_chunk *chunk = CHUNK(stripe, p);
1340 atomic_set(&chunk->cnt, 0);
1341 chunk->stripe = stripe;
1342 i = ARRAY_SIZE(chunk->bl);
1343 while (i--)
1344 bio_list_init(chunk->bl + i);
1347 stripe->sc = sc;
1349 i = ARRAY_SIZE(stripe->lists);
1350 while (i--)
1351 INIT_LIST_HEAD(stripe->lists + i);
1353 stripe->io.size = RS(sc)->set.io_size;
1354 atomic_set(&stripe->cnt, 0);
1355 atomic_set(&stripe->io.pending, 0);
1356 stripe_invalidate(stripe);
1359 /* Number of pages per chunk. */
1360 static inline unsigned chunk_pages(unsigned sectors)
1362 return dm_div_up(sectors, SECTORS_PER_PAGE);
1365 /* Number of pages per stripe. */
1366 static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
1368 return chunk_pages(io_size) * rs->set.raid_devs;
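/*
 * Example (added for clarity): with the default io size of
 * IO_SIZE_DEFAULT = 8 sectors and 4 KiB pages (SECTORS_PER_PAGE = 8),
 * chunk_pages() yields dm_div_up(8, 8) = 1 page per chunk, so
 * stripe_pages() needs one page per RAID device in the set.
 */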
1371 /* Initialize part of page_list (recovery). */
1372 static void stripe_zero_pl_part(struct stripe *stripe, int p,
1373 unsigned start, unsigned count)
1375 unsigned o = start / SECTORS_PER_PAGE, pages = chunk_pages(count);
1376 /* Get offset into the page_list. */
1377 struct page_list *pl = pl_elem(PL(stripe, p), o);
1379 BUG_ON(!pl);
1380 while (pl && pages--) {
1381 BUG_ON(!pl->page);
1382 memset(page_address(pl->page), 0, PAGE_SIZE);
1383 pl = pl->next;
1387 /* Initialize parity chunk of stripe. */
1388 static void stripe_zero_chunk(struct stripe *stripe, int p)
1390 if (p > -1)
1391 stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
1394 /* Return dynamic stripe structure size. */
1395 static size_t stripe_size(struct raid_set *rs)
1397 return sizeof(struct stripe) +
1398 rs->set.raid_devs * sizeof(struct stripe_chunk);
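/*
 * Note (added for clarity): struct stripe ends in the flexible array
 * member "struct stripe_chunk chunk[0]", so stripe_size() sizes the
 * slab objects to hold one stripe_chunk per RAID device.  For a
 * 5-device set that is sizeof(struct stripe) +
 * 5 * sizeof(struct stripe_chunk) bytes, and CHUNK(stripe, p) indexes
 * into that trailing array.
 */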
1401 /* Allocate a stripe and its memory object. */
1402 /* XXX adjust to cope with stripe cache and recovery stripe caches. */
1403 enum grow { SC_GROW, SC_KEEP };
1404 static struct stripe *stripe_alloc(struct stripe_cache *sc,
1405 struct dm_mem_cache_client *mc,
1406 enum grow grow)
1408 int r;
1409 struct stripe *stripe;
1411 stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
1412 if (stripe) {
1413 /* Grow the dm-mem-cache by one object. */
1414 if (grow == SC_GROW) {
1415 r = dm_mem_cache_grow(mc, 1);
1416 if (r)
1417 goto err_free;
1420 stripe->obj = dm_mem_cache_alloc(mc);
1421 if (IS_ERR(stripe->obj))
1422 goto err_shrink;
1424 stripe_init(sc, stripe);
1427 return stripe;
1429 err_shrink:
1430 if (grow == SC_GROW)
1431 dm_mem_cache_shrink(mc, 1);
1432 err_free:
1433 kmem_cache_free(sc->kc.cache, stripe);
1434 return NULL;
1438 * Free a stripes memory object, shrink the
1439 * memory cache and free the stripe itself.
1441 static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
1443 dm_mem_cache_free(mc, stripe->obj);
1444 dm_mem_cache_shrink(mc, 1);
1445 kmem_cache_free(stripe->sc->kc.cache, stripe);
1448 /* Free the recovery stripe. */
1449 static void stripe_recover_free(struct raid_set *rs)
1451 struct recover *rec = &rs->recover;
1452 struct dm_mem_cache_client *mc;
1454 mc = rec->mem_cache_client;
1455 rec->mem_cache_client = NULL;
1456 if (mc) {
1457 struct stripe *stripe;
1459 while (!list_empty(&rec->stripes)) {
1460 stripe = list_first_entry(&rec->stripes, struct stripe,
1461 lists[LIST_RECOVER]);
1462 list_del(stripe->lists + LIST_RECOVER);
1463 kfree(stripe->recover);
1464 stripe_free(stripe, mc);
1467 dm_mem_cache_client_destroy(mc);
1468 dm_io_client_destroy(rec->dm_io_client);
1469 rec->dm_io_client = NULL;
1473 /* Grow stripe cache. */
1474 static int sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
1476 int r = 0;
1478 /* Try to allocate this many (additional) stripes. */
1479 while (stripes--) {
1480 struct stripe *stripe =
1481 stripe_alloc(sc, sc->mem_cache_client, grow);
1483 if (likely(stripe)) {
1484 stripe_lru_add(stripe);
1485 atomic_inc(&sc->stripes);
1486 } else {
1487 r = -ENOMEM;
1488 break;
1492 return r ? r : sc_hash_resize(sc);
1495 /* Shrink stripe cache. */
1496 static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
1498 int r = 0;
1500 /* Try to get unused stripe from LRU list. */
1501 while (stripes--) {
1502 struct stripe *stripe;
1504 stripe = stripe_lru_pop(sc);
1505 if (stripe) {
1506 /* An LRU stripe may never have ios pending! */
1507 BUG_ON(stripe_io_ref(stripe));
1508 BUG_ON(stripe_ref(stripe));
1509 atomic_dec(&sc->stripes);
1510 /* Remove from hash if on before deletion. */
1511 stripe_hash_del(stripe);
1512 stripe_free(stripe, sc->mem_cache_client);
1513 } else {
1514 r = -ENOENT;
1515 break;
1519 /* Check if stats are still sane. */
1520 if (atomic_read(&sc->active_stripes_max) >
1521 atomic_read(&sc->stripes))
1522 atomic_set(&sc->active_stripes_max, 0);
1524 if (r)
1525 return r;
1527 return atomic_read(&sc->stripes) ? sc_hash_resize(sc) : 0;
1530 /* Create stripe cache and recovery. */
1531 static int sc_init(struct raid_set *rs, unsigned stripes)
1533 unsigned i, r, rstripes;
1534 struct stripe_cache *sc = &rs->sc;
1535 struct stripe *stripe;
1536 struct recover *rec = &rs->recover;
1537 struct mapped_device *md;
1538 struct gendisk *disk;
1541 /* Initialize lists and locks. */
1542 i = ARRAY_SIZE(sc->lists);
1543 while (i--)
1544 INIT_LIST_HEAD(sc->lists + i);
1546 INIT_LIST_HEAD(&rec->stripes);
1548 /* Initialize endio and LRU list locks. */
1549 i = NR_LOCKS;
1550 while (i--)
1551 spin_lock_init(sc->locks + i);
1553 /* Initialize atomic variables. */
1554 atomic_set(&sc->stripes, 0);
1555 atomic_set(&sc->stripes_to_set, 0);
1556 atomic_set(&sc->active_stripes, 0);
1557 atomic_set(&sc->active_stripes_max, 0); /* REMOVEME: statistics. */
1560 * We need a runtime unique # to suffix the kmem cache name
1561 * because we'll have one for each active RAID set.
1563 md = dm_table_get_md(rs->ti->table);
1564 disk = dm_disk(md);
1565 snprintf(sc->kc.name, sizeof(sc->kc.name), "%s-%d.%d", TARGET,
1566 disk->first_minor, atomic_inc_return(&_stripe_sc_nr));
1567 sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
1568 0, 0, NULL);
1569 if (!sc->kc.cache)
1570 return -ENOMEM;
1572 /* Create memory cache client context for RAID stripe cache. */
1573 sc->mem_cache_client =
1574 dm_mem_cache_client_create(stripes, rs->set.raid_devs,
1575 chunk_pages(rs->set.io_size));
1576 if (IS_ERR(sc->mem_cache_client))
1577 return PTR_ERR(sc->mem_cache_client);
1579 /* Create memory cache client context for RAID recovery stripe(s). */
1580 rstripes = rec->recovery_stripes;
1581 rec->mem_cache_client =
1582 dm_mem_cache_client_create(rstripes, rs->set.raid_devs,
1583 chunk_pages(rec->io_size));
1584 if (IS_ERR(rec->mem_cache_client))
1585 return PTR_ERR(rec->mem_cache_client);
1587 /* Create dm-io client context for IO stripes. */
1588 sc->dm_io_client = dm_io_client_create();
1589 if (IS_ERR(sc->dm_io_client))
1590 return PTR_ERR(sc->dm_io_client);
1592 /* FIXME: intermingled with stripe cache initialization. */
1593 /* Create dm-io client context for recovery stripes. */
1594 rec->dm_io_client = dm_io_client_create();
1595 if (IS_ERR(rec->dm_io_client))
1596 return PTR_ERR(rec->dm_io_client);
1598 /* Allocate stripes for set recovery. */
1599 while (rstripes--) {
1600 stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
1601 if (!stripe)
1602 return -ENOMEM;
1604 stripe->recover = kzalloc(sizeof(*stripe->recover), GFP_KERNEL);
1605 if (!stripe->recover) {
1606 stripe_free(stripe, rec->mem_cache_client);
1607 return -ENOMEM;
1610 SetStripeRecover(stripe);
1611 stripe->io.size = rec->io_size;
1612 list_add_tail(stripe->lists + LIST_RECOVER, &rec->stripes);
1613 /* Don't add recovery stripes to LRU list! */
1617 * Allocate the stripe objects from the
1618 * cache and add them to the LRU list.
1620 r = sc_grow(sc, stripes, SC_KEEP);
1621 if (!r)
1622 atomic_set(&sc->stripes_last, stripes);
1624 return r;
1627 /* Destroy the stripe cache. */
1628 static void sc_exit(struct stripe_cache *sc)
1630 struct raid_set *rs = RS(sc);
1632 if (sc->kc.cache) {
1633 stripe_recover_free(rs);
1634 BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
1635 kmem_cache_destroy(sc->kc.cache);
1636 sc->kc.cache = NULL;
1638 if (sc->mem_cache_client && !IS_ERR(sc->mem_cache_client))
1639 dm_mem_cache_client_destroy(sc->mem_cache_client);
1641 if (sc->dm_io_client && !IS_ERR(sc->dm_io_client))
1642 dm_io_client_destroy(sc->dm_io_client);
1644 hash_exit(&sc->hash);
1649 * Calculate RAID address
1651 * Delivers tuple with the index of the data disk holding the chunk
1652 * in the set, the parity disks index and the start of the stripe
1653 * within the address space of the set (used as the stripe cache hash key).
1655 /* thx MD. */
1656 static struct raid_address *raid_address(struct raid_set *rs, sector_t sector,
1657 struct raid_address *addr)
1659 sector_t stripe, tmp;
1662 * chunk_number = sector / chunk_size
1663 * stripe_number = chunk_number / data_devs
1664 * di = stripe % data_devs;
1666 stripe = sector >> rs->set.chunk_shift;
1667 addr->di = sector_div(stripe, rs->set.data_devs);
1669 switch (rs->set.raid_type->level) {
1670 case raid4:
1671 addr->pi = rs->set.pi;
1672 goto check_shift_di;
1673 case raid5:
1674 tmp = stripe;
1675 addr->pi = sector_div(tmp, rs->set.raid_devs);
1677 switch (rs->set.raid_type->algorithm) {
1678 case left_asym: /* Left asymmetric. */
1679 addr->pi = rs->set.data_devs - addr->pi;
1680 case right_asym: /* Right asymmetric. */
1681 check_shift_di:
1682 if (addr->di >= addr->pi)
1683 addr->di++;
1684 break;
1685 case left_sym: /* Left symmetric. */
1686 addr->pi = rs->set.data_devs - addr->pi;
1687 case right_sym: /* Right symmetric. */
1688 addr->di = (addr->pi + addr->di + 1) %
1689 rs->set.raid_devs;
1690 break;
1691 case none: /* Ain't happen: RAID4 algorithm placeholder. */
1692 BUG();
1697 * Start offset of the stripe's chunk on any single device of the RAID
1698 * set, adjusted in case io size differs from chunk size.
1700 addr->key = (stripe << rs->set.chunk_shift) +
1701 (sector & rs->set.io_inv_mask);
1702 return addr;
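/*
 * Worked example (added for clarity; parameters assumed): a 4-device
 * "raid5_ls" set (data_devs = 3), 64-sector chunks (chunk_shift = 6),
 * sector = 1000:
 *
 *	chunk number  = 1000 >> 6 = 15
 *	stripe number = 15 / 3    = 5,   di = 15 % 3 = 0
 *	pi            = 5 % 4     = 1
 *	left symmetric: pi = 3 - 1 = 2,  di = (2 + 0 + 1) % 4 = 3
 *
 * i.e. the data chunk lives on device 3, parity on device 2, and the
 * hash key starts at 5 << 6 = 320 plus the io offset within the chunk
 * (sector & io_inv_mask).
 */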
1706 * Copy data across between stripe pages and bio vectors.
1708 * Pay attention to data alignment in stripe and bio pages.
1710 static void bio_copy_page_list(int rw, struct stripe *stripe,
1711 struct page_list *pl, struct bio *bio)
1713 unsigned i, page_offset;
1714 void *page_addr;
1715 struct raid_set *rs = RS(stripe->sc);
1716 struct bio_vec *bv;
1718 /* Get start page in page list for this sector. */
1719 i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
1720 pl = pl_elem(pl, i);
1721 BUG_ON(!pl);
1722 BUG_ON(!pl->page);
1724 page_addr = page_address(pl->page);
1725 page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
1727 /* Walk all segments and copy data across between bio_vecs and pages. */
1728 bio_for_each_segment(bv, bio, i) {
1729 int len = bv->bv_len, size;
1730 unsigned bio_offset = 0;
1731 void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
1732 redo:
1733 size = (page_offset + len > PAGE_SIZE) ?
1734 PAGE_SIZE - page_offset : len;
1736 if (rw == READ)
1737 memcpy(bio_addr + bio_offset,
1738 page_addr + page_offset, size);
1739 else
1740 memcpy(page_addr + page_offset,
1741 bio_addr + bio_offset, size);
1743 page_offset += size;
1744 if (page_offset == PAGE_SIZE) {
1746 * We reached the end of the chunk page ->
1747 * need to refer to the next one to copy more data.
1749 len -= size;
1750 if (len) {
1751 /* Get next page. */
1752 pl = pl->next;
1753 BUG_ON(!pl);
1754 BUG_ON(!pl->page);
1755 page_addr = page_address(pl->page);
1756 page_offset = 0;
1757 bio_offset += size;
1758 /* REMOVEME: statistics. */
1759 atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
1760 goto redo;
1764 __bio_kunmap_atomic(bio_addr, KM_USER0);
1769 * Xor optimization macros.
1771 /* Xor data pointer declaration and initialization macros. */
1772 #define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1]
1773 #define DECLARE_3 DECLARE_2, *d2 = data[2]
1774 #define DECLARE_4 DECLARE_3, *d3 = data[3]
1775 #define DECLARE_5 DECLARE_4, *d4 = data[4]
1776 #define DECLARE_6 DECLARE_5, *d5 = data[5]
1777 #define DECLARE_7 DECLARE_6, *d6 = data[6]
1778 #define DECLARE_8 DECLARE_7, *d7 = data[7]
1780 /* Xor unroll macros. */
1781 #define D2(n) d0[n] = d0[n] ^ d1[n]
1782 #define D3(n) D2(n) ^ d2[n]
1783 #define D4(n) D3(n) ^ d3[n]
1784 #define D5(n) D4(n) ^ d4[n]
1785 #define D6(n) D5(n) ^ d5[n]
1786 #define D7(n) D6(n) ^ d6[n]
1787 #define D8(n) D7(n) ^ d7[n]
1789 #define X_2(macro, offset) macro(offset); macro(offset + 1);
1790 #define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2);
1791 #define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4);
1792 #define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8);
1793 #define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16);
1794 #define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32);
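/*
 * For reference (expansion added for clarity): the D and X_ macros
 * combine into straight-line unrolled xor code.  E.g. X_2(D3, 0)
 * expands to
 *
 *	d0[0] = d0[0] ^ d1[0] ^ d2[0];
 *	d0[1] = d0[1] ^ d1[1] ^ d2[1];
 *
 * i.e. the first data array accumulates the xor of all chunks, with 2,
 * 4, 8, ... or 64 iterations unrolled per loop pass.
 */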
1796 /* Define a _xor_#chunks_#xors_per_run() function. */
1797 #define _XOR(chunks, xors_per_run) \
1798 static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
1800 unsigned end = XOR_SIZE / sizeof(data[0]), i; \
1801 DECLARE_ ## chunks; \
1803 for (i = 0; i < end; i += xors_per_run) { \
1804 X_ ## xors_per_run(D ## chunks, i); \
1808 /* Define xor functions for 2 - 8 chunks and xors per run. */
1809 #define MAKE_XOR_PER_RUN(xors_per_run) \
1810 _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
1811 _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
1812 _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
1813 _XOR(8, xors_per_run);
1815 MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */
1816 MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */
1817 MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */
1818 MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */
1820 #define MAKE_XOR(xors_per_run) \
1821 struct { \
1822 void (*f)(unsigned long **); \
1823 } static xor_funcs ## xors_per_run[] = { \
1824 { NULL }, /* NULL pointers to optimize indexing in xor(). */ \
1825 { NULL }, \
1826 { _xor2_ ## xors_per_run }, \
1827 { _xor3_ ## xors_per_run }, \
1828 { _xor4_ ## xors_per_run }, \
1829 { _xor5_ ## xors_per_run }, \
1830 { _xor6_ ## xors_per_run }, \
1831 { _xor7_ ## xors_per_run }, \
1832 { _xor8_ ## xors_per_run }, \
1833 }; \
1835 static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
1837 /* Call respective function for amount of chunks. */ \
1838 xor_funcs ## xors_per_run[n].f(data); \
1841 /* Define xor_8() - xor_64 functions. */
1842 MAKE_XOR(8)
1843 MAKE_XOR(16)
1844 MAKE_XOR(32)
1845 MAKE_XOR(64)
1847 * END xor optimization macros.
1850 /* Maximum number of chunks, which can be xor'ed in one go. */
1851 #define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
1853 /* xor_blocks wrapper to allow use of that crypto library function. */
1854 static void xor_blocks_wrapper(unsigned n, unsigned long **data)
1856 BUG_ON(n < 2 || n > MAX_XOR_BLOCKS + 1);
1857 xor_blocks(n - 1, XOR_SIZE, (void *) data[0], (void **) data + 1);
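/*
 * Note (added for clarity): xor_blocks() from <linux/raid/xor.h> takes
 * the number of source blocks, the block length, the destination and
 * an array of source pointers.  The wrapper above therefore passes
 * n - 1 sources, uses data[0] as the in-place destination and data + 1
 * as the source array, matching the (count, data) convention of the
 * generated xor_*() functions.
 */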
1860 struct xor_func {
1861 xor_function_t f;
1862 const char *name;
1863 } static xor_funcs[] = {
1864 { xor_64, "xor_64" },
1865 { xor_32, "xor_32" },
1866 { xor_16, "xor_16" },
1867 { xor_8, "xor_8" },
1868 { xor_blocks_wrapper, "xor_blocks" },
1872 * Check if chunk has to be xored in/out:
1874 * o if writes are queued
1875 * o if writes are merged
1876 * o if stripe is to be reconstructed
1877 * o if recovery stripe
1879 static inline int chunk_must_xor(struct stripe_chunk *chunk)
1881 if (ChunkUptodate(chunk)) {
1882 BUG_ON(!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) &&
1883 !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)));
1885 if (!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) ||
1886 !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)))
1887 return 1;
1889 if (StripeReconstruct(chunk->stripe) ||
1890 StripeRecover(chunk->stripe))
1891 return 1;
1894 return 0;
1898 * Calculate xor parity.
1900 * This indexes into the chunks of a stripe and their pages.
1902 * All chunks will be xored into the indexed (@pi)
1903 * chunk in maximum groups of xor.chunks.
1906 static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
1908 struct raid_set *rs = RS(stripe->sc);
1909 unsigned max_chunks = rs->xor.chunks, n = 1,
1910 o = sector / SECTORS_PER_PAGE, /* Offset into the page_list. */
1911 p = rs->set.raid_devs;
1912 unsigned long **d = rs->data;
1913 xor_function_t xor_f = rs->xor.f->f;
1915 BUG_ON(sector > stripe->io.size);
1917 /* Address of parity page to xor into. */
1918 d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
1920 while (p--) {
1921 /* Preset pointers to data pages. */
1922 if (p != pi && chunk_must_xor(CHUNK(stripe, p)))
1923 d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
1925 /* If max chunks -> xor. */
1926 if (n == max_chunks) {
1927 mutex_lock(&rs->io.xor_lock);
1928 xor_f(n, d);
1929 mutex_unlock(&rs->io.xor_lock);
1930 n = 1;
1934 /* If chunks -> xor. */
1935 if (n > 1) {
1936 mutex_lock(&rs->io.xor_lock);
1937 xor_f(n, d);
1938 mutex_unlock(&rs->io.xor_lock);
1942 /* Common xor loop through all stripe page lists. */
1943 static void common_xor(struct stripe *stripe, sector_t count,
1944 unsigned off, unsigned pi)
1946 unsigned sector;
1948 BUG_ON(!count);
1949 for (sector = off; sector < count; sector += SECTORS_PER_PAGE)
1950 xor(stripe, pi, sector);
1952 /* Set parity page uptodate and clean. */
1953 chunk_set(CHUNK(stripe, pi), CLEAN);
1954 atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
1958 * Calculate parity sectors on intact stripes.
1960 * Need to calculate raid address for recover stripe, because its
1961 * chunk size differs and is typically larger than io chunk size.
1963 static void parity_xor(struct stripe *stripe)
1965 struct raid_set *rs = RS(stripe->sc);
1966 int size_differs = stripe->io.size != rs->set.io_size;
1967 unsigned chunk_size = rs->set.chunk_size, io_size = stripe->io.size,
1968 xor_size = chunk_size > io_size ? io_size : chunk_size;
1969 sector_t off;
1971 /* This can be the recover stripe with a larger io size. */
1972 for (off = 0; off < io_size; off += xor_size) {
1974 * Recover stripe is likely bigger than regular io
1975 * ones and has no precalculated parity disk index ->
1976 * need to calculate RAID address.
1978 if (unlikely(size_differs)) {
1979 struct raid_address addr;
1981 raid_address(rs, (stripe->key + off) *
1982 rs->set.data_devs, &addr);
1983 stripe->idx.parity = addr.pi;
1984 stripe_zero_pl_part(stripe, addr.pi, off, xor_size);
1987 common_xor(stripe, xor_size, off, stripe->idx.parity);
1988 chunk_set(CHUNK(stripe, stripe->idx.parity), DIRTY);
1992 /* Reconstruct missing chunk. */
1993 static void stripe_reconstruct(struct stripe *stripe)
1995 struct raid_set *rs = RS(stripe->sc);
1996 int p = rs->set.raid_devs, pr = stripe->idx.recover;
1998 BUG_ON(pr < 0);
2000 /* Check if all but the chunk to be reconstructed are uptodate. */
2001 while (p--)
2002 BUG_ON(p != pr && !ChunkUptodate(CHUNK(stripe, p)));
2004 /* REMOVEME: statistics. */
2005 atomic_inc(rs->stats + (RSDegraded(rs) ? S_RECONSTRUCT_EI :
2006 S_RECONSTRUCT_DEV));
2007 /* Zero chunk to be reconstructed. */
2008 stripe_zero_chunk(stripe, pr);
2009 common_xor(stripe, stripe->io.size, 0, pr);
2013 * Recovery io throttling
2015 /* Conditionally reset io counters. */
2016 static int recover_io_reset(struct raid_set *rs)
2018 unsigned long j = jiffies;
2020 /* Pay attention to jiffies overflows. */
2021 if (j > rs->recover.last_jiffies + HZ ||
2022 j < rs->recover.last_jiffies) {
2023 atomic_set(rs->recover.io_count + IO_WORK, 0);
2024 atomic_set(rs->recover.io_count + IO_RECOVER, 0);
2025 rs->recover.last_jiffies = j;
2026 return 1;
2029 return 0;
2032 /* Count ios. */
2033 static void recover_io_count(struct stripe *stripe)
2035 struct raid_set *rs = RS(stripe->sc);
2037 atomic_inc(rs->recover.io_count +
2038 (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
2041 /* Try getting a stripe either from the hash or from the LRU list. */
2042 static struct stripe *stripe_find(struct raid_set *rs,
2043 struct raid_address *addr)
2045 int r;
2046 struct stripe_cache *sc = &rs->sc;
2047 struct stripe *stripe;
2049 /* Try stripe from hash. */
2050 stripe = stripe_lookup(sc, addr->key);
2051 if (stripe) {
2052 r = stripe_get(stripe);
2053 if (r)
2054 goto get_lock_failed;
2056 atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
2057 } else {
2058 /* Not in hash -> try to get an LRU stripe. */
2059 stripe = stripe_lru_pop(sc);
2060 if (stripe) {
2062 * An LRU stripe must not be referenced
2063 * and must never have ios pending!
2065 BUG_ON(stripe_ref(stripe));
2066 BUG_ON(stripe_io_ref(stripe));
2068 /* Remove from hash before reuse, if it is hashed. */
2069 stripe_hash_del(stripe);
2071 /* Invalidate before reinserting with changed key. */
2072 stripe_invalidate(stripe);
2074 stripe->key = addr->key;
2075 stripe->region = dm_rh_sector_to_region(rs->recover.rh,
2076 addr->key);
2077 stripe->idx.parity = addr->pi;
2078 r = stripe_get(stripe);
2079 if (r)
2080 goto get_lock_failed;
2082 /* Insert stripe into the stripe hash. */
2083 stripe_insert(&sc->hash, stripe);
2084 /* REMOVEME: statistics. */
2085 atomic_inc(rs->stats + S_INSCACHE);
2089 return stripe;
2091 get_lock_failed:
2092 stripe_put(stripe);
2093 return NULL;
2097 * Process end io
2099 * We need to do it here, because we can't do it in interrupt context.
2101 /* End io all bios on a bio list. */
2102 static void bio_list_endio(struct stripe *stripe, struct bio_list *bl,
2103 int p, int error)
2105 struct raid_set *rs = RS(stripe->sc);
2106 struct bio *bio;
2107 struct page_list *pl = PL(stripe, p);
2108 struct stripe_chunk *chunk = CHUNK(stripe, p);
2110 /* Update region counters. */
2111 while ((bio = bio_list_pop(bl))) {
2112 if (bio_data_dir(bio) == WRITE)
2113 /* Drop io pending count for any writes. */
2114 dm_rh_dec(rs->recover.rh, stripe->region);
2115 else if (!error)
2116 /* Copy data across. */
2117 bio_copy_page_list(READ, stripe, pl, bio);
2119 bio_endio(bio, error);
2121 /* REMOVEME: statistics. */
2122 atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
2123 S_BIOS_ENDIO_READ : S_BIOS_ENDIO_WRITE));
2125 chunk_put(chunk);
2126 stripe_put(stripe);
2127 io_put(rs); /* Wake any suspend waiters on last bio. */
2132 * End io all reads/writes on a stripe copying
2133 * read data across from stripe to bios and
2134 * decrementing region counters for writes.
2136 * Processing of ios depending on state:
2137 * o no chunk error -> endio ok
2138 * o degraded:
2139 * - chunk error and read -> ignore to be requeued
2140 * - chunk error and write -> endio ok
2141 * o dead (more than parity_devs failed) and chunk error -> endio failed
2143 static void stripe_endio(int rw, struct stripe *stripe)
2145 struct raid_set *rs = RS(stripe->sc);
2146 unsigned p = rs->set.raid_devs;
2147 int write = (rw != READ);
2149 while (p--) {
2150 struct stripe_chunk *chunk = CHUNK(stripe, p);
2151 struct bio_list *bl;
2153 BUG_ON(ChunkLocked(chunk));
2155 bl = BL_CHUNK(chunk, rw);
2156 if (bio_list_empty(bl))
2157 continue;
2159 if (unlikely(ChunkError(chunk) || !ChunkUptodate(chunk))) {
2160 /* RAID set dead. */
2161 if (unlikely(RSDead(rs)))
2162 bio_list_endio(stripe, bl, p, -EIO);
2163 /* RAID set degraded. */
2164 else if (write)
2165 bio_list_endio(stripe, bl, p, 0);
2166 } else {
2167 BUG_ON(!RSDegraded(rs) && ChunkDirty(chunk));
2168 bio_list_endio(stripe, bl, p, 0);
2173 /* Fail all ios hanging off all bio lists of a stripe. */
2174 static void stripe_fail_io(struct stripe *stripe)
2176 struct raid_set *rs = RS(stripe->sc);
2177 unsigned p = rs->set.raid_devs;
2179 while (p--) {
2180 struct stripe_chunk *chunk = CHUNK(stripe, p);
2181 int i = ARRAY_SIZE(chunk->bl);
2183 /* Fail all bios on all bio lists of the stripe. */
2184 while (i--) {
2185 struct bio_list *bl = chunk->bl + i;
2187 if (!bio_list_empty(bl))
2188 bio_list_endio(stripe, bl, p, -EIO);
2192 /* Put stripe on LRU list. */
2193 BUG_ON(stripe_io_ref(stripe));
2194 BUG_ON(stripe_ref(stripe));
2197 /* Unlock all required chunks. */
2198 static void stripe_chunks_unlock(struct stripe *stripe)
2200 unsigned p = RS(stripe->sc)->set.raid_devs;
2201 struct stripe_chunk *chunk;
2203 while (p--) {
2204 chunk = CHUNK(stripe, p);
2206 if (TestClearChunkUnlock(chunk))
2207 ClearChunkLocked(chunk);
2212 * Queue reads and writes to a stripe by hanging
2213 * their bios off the stripe set's read/write lists.
2215 static int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
2216 struct bio_list *reject)
2218 struct raid_address addr;
2219 struct stripe *stripe;
2221 stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr));
2222 if (stripe) {
2223 int r = 0, rw = bio_data_dir(bio);
2225 /* Distinguish reads and writes. */
2226 bio_list_add(BL(stripe, addr.di, rw), bio);
2228 if (rw == READ)
2229 /* REMOVEME: statistics. */
2230 atomic_inc(rs->stats + S_BIOS_ADDED_READ);
2231 else {
2232 /* Increment pending write count on region. */
2233 dm_rh_inc(rs->recover.rh, stripe->region);
2234 r = 1;
2236 /* REMOVEME: statistics. */
2237 atomic_inc(rs->stats + S_BIOS_ADDED_WRITE);
2241 * Put on io (flush) list in case of
2242 * initial bio queued to chunk.
2244 if (chunk_get(CHUNK(stripe, addr.di)) == 1)
2245 stripe_flush_add(stripe);
2247 return r;
2250 /* Got no stripe from cache or failed to lock it -> reject bio. */
2251 bio_list_add(reject, bio);
2252 atomic_inc(rs->stats + S_IOS_POST); /* REMOVEME: statistics. */
2253 return 0;
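/*
 * Note: the return value is 1 only when a write got queued (and
 * dm_rh_inc() was called); do_ios() accumulates these returns in its
 * flush counter and flushes the dirty log once per batch before the
 * queued writes are submitted by the later do_flush() pass.
 */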
2257 * Handle all stripes by handing them to the daemon, because we can't
2258 * map their chunk pages to copy the data in interrupt context.
2260 * We don't want to handle them here either, while interrupts are disabled.
2263 /* Read/write endio function for dm-io (interrupt context). */
2264 static void endio(unsigned long error, void *context)
2266 struct stripe_chunk *chunk = context;
2268 if (unlikely(error)) {
2269 chunk_set(chunk, ERROR);
2270 /* REMOVEME: statistics. */
2271 atomic_inc(RS(chunk->stripe->sc)->stats + S_STRIPE_ERROR);
2272 } else
2273 chunk_set(chunk, CLEAN);
2276 * For recovery stripes, we need to clear the locked flag
2277 * here, because those aren't processed in do_endios().
2279 if (unlikely(StripeRecover(chunk->stripe)))
2280 ClearChunkLocked(chunk);
2281 else
2282 SetChunkUnlock(chunk);
2284 /* Indirectly puts stripe on cache's endio list via stripe_io_put(). */
2285 stripe_put_references(chunk->stripe);
2288 /* Read/Write a chunk asynchronously. */
2289 static void stripe_chunk_rw(struct stripe *stripe, unsigned p)
2291 struct stripe_cache *sc = stripe->sc;
2292 struct raid_set *rs = RS(sc);
2293 struct dm_mem_cache_object *obj = stripe->obj + p;
2294 struct page_list *pl = obj->pl;
2295 struct stripe_chunk *chunk = CHUNK(stripe, p);
2296 struct raid_dev *dev = rs->dev + p;
2297 struct dm_io_region io = {
2298 .bdev = dev->dev->bdev,
2299 .sector = stripe->key,
2300 .count = stripe->io.size,
2302 struct dm_io_request control = {
2303 .bi_rw = ChunkDirty(chunk) ? WRITE : READ,
2304 .mem = {
2305 .type = DM_IO_PAGE_LIST,
2306 .ptr.pl = pl,
2307 .offset = 0,
2309 .notify = {
2310 .fn = endio,
2311 .context = chunk,
2313 .client = StripeRecover(stripe) ? rs->recover.dm_io_client :
2314 sc->dm_io_client,
2317 BUG_ON(ChunkLocked(chunk));
2318 BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk));
2319 BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk));
2322 * Don't rw past end of device, which can happen, because
2323 * typically sectors_per_dev isn't divisible by io_size.
2325 if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
2326 io.count = rs->set.sectors_per_dev - io.sector;
2328 BUG_ON(!io.count);
2329 io.sector += dev->start; /* Add <offset>. */
2330 if (RSRecover(rs))
2331 recover_io_count(stripe); /* Recovery io accounting. */
2333 /* REMOVEME: statistics. */
2334 atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE :
2335 S_DM_IO_READ));
2336 SetChunkLocked(chunk);
2337 SetDevIoQueued(dev);
2338 BUG_ON(dm_io(&control, 1, &io, NULL));
2342 * Write dirty or read not uptodate page lists of a stripe.
2344 static int stripe_chunks_rw(struct stripe *stripe)
2346 int r;
2347 struct raid_set *rs = RS(stripe->sc);
2350 * Increment the pending count on the stripe
2351 * first, so that we don't race in endio().
2353 * An inc (IO) is needed for any chunk unless !ChunkIo(chunk):
2355 * o not uptodate
2356 * o dirtied by writes merged
2357 * o dirtied by parity calculations
2359 r = for_each_io_dev(stripe, stripe_get_references);
2360 if (r) {
2361 /* Io needed: chunks are either not uptodate or dirty. */
2362 int max; /* REMOVEME: */
2363 struct stripe_cache *sc = &rs->sc;
2365 /* Submit actual io. */
2366 for_each_io_dev(stripe, stripe_chunk_rw);
2368 /* REMOVEME: statistics */
2369 max = sc_active(sc);
2370 if (atomic_read(&sc->active_stripes_max) < max)
2371 atomic_set(&sc->active_stripes_max, max);
2373 atomic_inc(rs->stats + S_FLUSHS);
2374 /* END REMOVEME: statistics */
2377 return r;
2380 /* Merge in all writes hence dirtying respective chunks. */
2381 static void stripe_merge_writes(struct stripe *stripe)
2383 unsigned p = RS(stripe->sc)->set.raid_devs;
2385 while (p--) {
2386 struct stripe_chunk *chunk = CHUNK(stripe, p);
2387 struct bio_list *write = BL_CHUNK(chunk, WRITE_QUEUED);
2389 if (!bio_list_empty(write)) {
2390 struct bio *bio;
2391 struct page_list *pl = stripe->obj[p].pl;
2394 * We can play with the lists without holding a lock,
2395 * because it is just us accessing them anyway.
2397 bio_list_for_each(bio, write)
2398 bio_copy_page_list(WRITE, stripe, pl, bio);
2400 bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), write);
2401 bio_list_init(write);
2402 chunk_set(chunk, DIRTY);
2407 /* Queue all writes to get merged. */
2408 static int stripe_queue_writes(struct stripe *stripe)
2410 int r = 0;
2411 unsigned p = RS(stripe->sc)->set.raid_devs;
2413 while (p--) {
2414 struct stripe_chunk *chunk = CHUNK(stripe, p);
2415 struct bio_list *write = BL_CHUNK(chunk, WRITE);
2417 if (!bio_list_empty(write)) {
2418 bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), write);
2419 bio_list_init(write);
2420 SetChunkIo(chunk);
2421 r = 1;
2425 return r;
2429 /* Check, if a chunk gets completely overwritten. */
2430 static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p)
2432 unsigned sectors = 0;
2433 struct bio *bio;
2434 struct bio_list *bl = BL(stripe, p, WRITE_QUEUED);
2436 bio_list_for_each(bio, bl)
2437 sectors += bio_sectors(bio);
2439 BUG_ON(sectors > RS(stripe->sc)->set.io_size);
2440 return sectors == RS(stripe->sc)->set.io_size;
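/*
 * Example (illustrative only): with io_size = 64 sectors, two queued
 * 32-sector writes to the same chunk sum up to exactly io_size, so the
 * chunk counts as completely overwritten and its old contents never
 * need to be read in for the parity calculation.
 */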
2444 * Avoid io on broken/reconstructed drive in order to
2445 * reconstruct data on endio.
2447 * (*1*) We set StripeReconstruct() in here, so that _do_endios()
2448 * will trigger a reconstruct call before resetting it.
2450 static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr)
2452 struct stripe_chunk *chunk = CHUNK(stripe, pr);
2455 * Allow io on all chunks but the indexed one,
2456 * because we're either degraded, or we prohibit io
2457 * on that one so it can be reconstructed later.
2459 /* Includes ClearChunkIo(), ClearChunkUptodate(). */
2460 stripe_chunk_invalidate(chunk);
2461 stripe->idx.recover = pr;
2462 SetStripeReconstruct(stripe);
2464 /* REMOVEME: statistics. */
2465 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2466 return -EPERM;
2469 /* Chunk locked/uptodate and device failed tests. */
2470 static struct stripe_chunk *
2471 stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate)
2473 struct raid_set *rs = RS(stripe->sc);
2474 struct stripe_chunk *chunk = CHUNK(stripe, p);
2476 /* Can't access active chunks. */
2477 if (ChunkLocked(chunk)) {
2478 /* REMOVEME: statistics. */
2479 atomic_inc(rs->stats + S_CHUNK_LOCKED);
2480 return NULL;
2483 /* Can't access broken device. */
2484 if (ChunkError(chunk) || DevFailed(rs->dev + p))
2485 return NULL;
2487 /* Can access uptodate chunks. */
2488 if (ChunkUptodate(chunk)) {
2489 (*chunks_uptodate)++;
2490 return NULL;
2493 return chunk;
2497 * Degraded/reconstruction mode.
2499 * Check stripe state to figure which chunks don't need IO.
2501 * Returns 0 for fully operational, -EPERM for degraded/resynchronizing.
2503 static int stripe_check_reconstruct(struct stripe *stripe)
2505 struct raid_set *rs = RS(stripe->sc);
2507 if (RSDead(rs)) {
2508 ClearStripeReconstruct(stripe);
2509 ClearStripeReconstructed(stripe);
2510 stripe_allow_io(stripe);
2511 return 0;
2514 /* Avoid further reconstruction setting, when already set. */
2515 if (StripeReconstruct(stripe)) {
2516 /* REMOVEME: statistics. */
2517 atomic_inc(rs->stats + S_RECONSTRUCT_SET);
2518 return -EBUSY;
2521 /* Initially allow io on all chunks. */
2522 stripe_allow_io(stripe);
2524 /* Return if stripe is already reconstructed. */
2525 if (StripeReconstructed(stripe)) {
2526 atomic_inc(rs->stats + S_RECONSTRUCTED);
2527 return 0;
2531 * Degraded/reconstruction mode (device failed) ->
2532 * avoid io on the failed device.
2534 if (unlikely(RSDegraded(rs))) {
2535 /* REMOVEME: statistics. */
2536 atomic_inc(rs->stats + S_DEGRADED);
2537 /* Allow IO on all devices but the dead one. */
2538 BUG_ON(rs->set.ei < 0);
2539 return stripe_chunk_set_io_flags(stripe, rs->set.ei);
2540 } else {
2541 int sync, pi = dev_for_parity(stripe, &sync);
2544 * Reconstruction mode (ie. a particular (replaced) device or
2545 * some (rotating) parity chunk is being resynchronized) ->
2546 * o make sure all needed chunks are read in
2547 * o cope with 3/4 disk array special case where it
2548 * doesn't make a difference to read in parity
2549 * to xor data in/out
2551 if (RSEnforceParityCreation(rs) || !sync) {
2552 /* REMOVEME: statistics. */
2553 atomic_inc(rs->stats + S_NOSYNC);
2554 /* Allow IO on all devs but the one to reconstruct. */
2555 return stripe_chunk_set_io_flags(stripe, pi);
2559 return 0;
2563 * Check, if stripe is ready to merge writes.
2564 * I.e. if all chunks are present so that bios can be merged.
2566 * We prohibit io on:
2568 * o chunks without bios
2569 * o chunks which get completely written over
2571 static int stripe_merge_possible(struct stripe *stripe, int nosync)
2573 struct raid_set *rs = RS(stripe->sc);
2574 unsigned chunks_overwrite = 0, chunks_prohibited = 0,
2575 chunks_uptodate = 0, p = rs->set.raid_devs;
2577 /* Walk all chunks. */
2578 while (p--) {
2579 struct stripe_chunk *chunk;
2581 /* Prohibit io on broken devices. */
2582 if (DevFailed(rs->dev + p)) {
2583 chunk = CHUNK(stripe, p);
2584 goto prohibit_io;
2587 /* We can't optimize any further if no chunk. */
2588 chunk = stripe_chunk_check(stripe, p, &chunks_uptodate);
2589 if (!chunk || nosync)
2590 continue;
2593 * We have a chunk, which is not uptodate.
2595 * If this is not parity and we don't have
2596 * reads queued, we can optimize further.
2598 if (p != stripe->idx.parity &&
2599 bio_list_empty(BL_CHUNK(chunk, READ)) &&
2600 bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) {
2601 if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)))
2602 goto prohibit_io;
2603 else if (RSCheckOverwrite(rs) &&
2604 stripe_check_chunk_overwrite(stripe, p))
2605 /* Completely overwritten chunk. */
2606 chunks_overwrite++;
2609 /* Allow io for chunks with bios and overwritten ones. */
2610 SetChunkIo(chunk);
2611 continue;
2613 prohibit_io:
2614 /* No io for broken devices or for chunks w/o bios. */
2615 ClearChunkIo(chunk);
2616 chunks_prohibited++;
2617 /* REMOVEME: statistics. */
2618 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2621 /* All data chunks will get written over. */
2622 if (chunks_overwrite == rs->set.data_devs)
2623 atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics.*/
2624 else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) {
2625 /* We don't have enough chunks to merge. */
2626 atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics.*/
2627 return -EPERM;
2631 * If we have all chunks up to date or overwrite them, we
2632 * just zero the parity chunk and let stripe_rw() recreate it.
2634 if (chunks_uptodate == rs->set.raid_devs ||
2635 chunks_overwrite == rs->set.data_devs) {
2636 stripe_zero_chunk(stripe, stripe->idx.parity);
2637 BUG_ON(StripeReconstruct(stripe));
2638 SetStripeReconstruct(stripe); /* Enforce xor in caller. */
2639 } else {
2641 * With less chunks, we xor parity out.
2643 * (*4*) We rely on !StripeReconstruct() in chunk_must_xor(),
2644 * so that only chunks with queued or merged writes
2645 * are being xored.
2647 parity_xor(stripe);
2651 * We do have enough chunks to merge.
2652 * All chunks are uptodate or get written over.
2654 atomic_inc(rs->stats + S_CAN_MERGE); /* REMOVEME: statistics. */
2655 return 0;
2659 * Avoid reading chunks in case we're fully operational.
2661 * We prohibit io on any chunks without bios but the parity chunk.
2663 static void stripe_avoid_reads(struct stripe *stripe)
2665 struct raid_set *rs = RS(stripe->sc);
2666 unsigned dummy = 0, p = rs->set.raid_devs;
2668 /* Walk all chunks. */
2669 while (p--) {
2670 struct stripe_chunk *chunk =
2671 stripe_chunk_check(stripe, p, &dummy);
2673 if (!chunk)
2674 continue;
2676 /* If parity or any bios pending -> allow io. */
2677 if (chunk_ref(chunk) || p == stripe->idx.parity)
2678 SetChunkIo(chunk);
2679 else {
2680 ClearChunkIo(chunk);
2681 /* REMOVEME: statistics. */
2682 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2688 * Read/write a stripe.
2690 * All stripe read/write activity goes through this function
2691 * unless recovery, which has to call stripe_chunk_rw() directly.
2693 * Make sure we don't try already merged stripes in order
2694 * to avoid data corruption.
2696 * Check the state of the RAID set and if degraded (or
2697 * resynchronizing for reads), read in all other chunks but
2698 * the one on the dead/resynchronizing device in order to be
2699 * able to reconstruct the missing one in _do_endios().
2701 * Can be called on active stripes in order
2702 * to dispatch new io on inactive chunks.
2704 * States to cover:
2705 * o stripe to read and/or write
2706 * o stripe with error to reconstruct
2708 static int stripe_rw(struct stripe *stripe)
2710 int nosync, r;
2711 struct raid_set *rs = RS(stripe->sc);
2714 * Check, if a chunk needs to be reconstructed
2715 * because of a degraded set or a region out of sync.
2717 nosync = stripe_check_reconstruct(stripe);
2718 switch (nosync) {
2719 case -EBUSY:
2720 return 0; /* Wait for stripe reconstruction to finish. */
2721 case -EPERM:
2722 goto io;
2726 * If we don't have merged writes pending, we can schedule
2727 * queued writes to be merged next without corrupting data.
2729 if (!StripeMerged(stripe)) {
2730 r = stripe_queue_writes(stripe);
2731 if (r)
2732 /* Writes got queued -> flag RBW. */
2733 SetStripeRBW(stripe);
2737 * Merge all writes hanging off uptodate/overwritten
2738 * chunks of the stripe.
2740 if (StripeRBW(stripe)) {
2741 r = stripe_merge_possible(stripe, nosync);
2742 if (!r) { /* Merge possible. */
2743 struct stripe_chunk *chunk;
2746 * I rely on valid parity in order
2747 * to xor a fraction of chunks out
2748 * of parity and back in.
2750 stripe_merge_writes(stripe); /* Merge writes in. */
2751 parity_xor(stripe); /* Update parity. */
2752 ClearStripeReconstruct(stripe); /* Reset xor enforce. */
2753 SetStripeMerged(stripe); /* Writes merged. */
2754 ClearStripeRBW(stripe); /* Disable RBW. */
2757 * REMOVEME: sanity check on parity chunk
2758 * states after writes got merged.
2760 chunk = CHUNK(stripe, stripe->idx.parity);
2761 BUG_ON(ChunkLocked(chunk));
2762 BUG_ON(!ChunkUptodate(chunk));
2763 BUG_ON(!ChunkDirty(chunk));
2764 BUG_ON(!ChunkIo(chunk));
2766 } else if (!nosync && !StripeMerged(stripe))
2767 /* Read avoidance if not degraded/resynchronizing/merged. */
2768 stripe_avoid_reads(stripe);
2771 /* Now submit any reads/writes for non-uptodate or dirty chunks. */
2772 r = stripe_chunks_rw(stripe);
2773 if (!r) {
2775 * No io submitted because of chunk io
2776 * prohibited or locked chunks/failed devices
2777 * -> push to end io list for processing.
2779 stripe_endio_push(stripe);
2780 atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
2783 return r;
2787 * Recovery functions
2789 /* Read a stripe off a raid set for recovery. */
2790 static int stripe_recover_read(struct stripe *stripe, int pi)
2792 BUG_ON(stripe_io_ref(stripe));
2794 /* Invalidate all chunks so that they get read in. */
2795 stripe_chunks_invalidate(stripe);
2796 stripe_allow_io(stripe); /* Allow io on all recovery chunks. */
2799 * If we are reconstructing a particular device, we can avoid
2800 * reading the respective chunk in, because we're going to
2801 * reconstruct it anyway.
2803 * We can't do that for resynchronization of rotating parity,
2804 * because the recovery stripe chunk size is typically larger
2805 * than the set's chunk size.
2807 if (pi > -1)
2808 ClearChunkIo(CHUNK(stripe, pi));
2810 return stripe_chunks_rw(stripe);
2813 /* Write a stripe to a raid set for recovery. */
2814 static int stripe_recover_write(struct stripe *stripe, int pi)
2816 BUG_ON(stripe_io_ref(stripe));
2819 * If this is a reconstruct of a particular device, then
2820 * reconstruct the respective chunk, else create parity chunk.
2822 if (pi > -1) {
2823 stripe_zero_chunk(stripe, pi);
2824 common_xor(stripe, stripe->io.size, 0, pi);
2825 chunk_set(CHUNK(stripe, pi), DIRTY);
2826 } else
2827 parity_xor(stripe);
2829 return stripe_chunks_rw(stripe);
2832 /* Read/write a recovery stripe. */
2833 static int stripe_recover_rw(struct stripe *stripe)
2835 int r = 0, sync = 0;
2837 /* Read/write flip-flop. */
2838 if (TestClearStripeRBW(stripe)) {
2839 SetStripeMerged(stripe);
2840 stripe->key = stripe->recover->pos;
2841 r = stripe_recover_read(stripe, dev_for_parity(stripe, &sync));
2842 BUG_ON(!r);
2843 } else if (TestClearStripeMerged(stripe)) {
2844 r = stripe_recover_write(stripe, dev_for_parity(stripe, &sync));
2845 BUG_ON(!r);
2848 BUG_ON(sync);
2849 return r;
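/*
 * In other words, each recovery stripe ping-pongs between a read pass
 * (StripeRBW set: pull in all chunks at recover->pos) and a write pass
 * (StripeMerged set: reconstruct or xor parity and write back); the
 * BUG_ON(sync) above guards against recovering a device that
 * dev_for_parity() already reports as in sync.
 */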
2852 /* Recovery bandwidth available? */
2853 static int recover_bandwidth(struct raid_set *rs)
2855 int r, work;
2857 /* On reset or when bios delayed -> allow recovery. */
2858 r = recover_io_reset(rs);
2859 if (r || RSBandwidth(rs))
2860 goto out;
2862 work = atomic_read(rs->recover.io_count + IO_WORK);
2863 if (work) {
2864 /* Pay attention to larger recover stripe size. */
2865 int recover = atomic_read(rs->recover.io_count + IO_RECOVER) *
2866 rs->recover.io_size / rs->set.io_size;
2869 * Don't use more than given bandwidth
2870 * of the work io for recovery.
2872 if (recover > work / rs->recover.bandwidth_work) {
2873 /* REMOVEME: statistics. */
2874 atomic_inc(rs->stats + S_NO_BANDWIDTH);
2875 return 0;
2879 out:
2880 atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
2881 return 1;
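/*
 * Worked example (illustrative, using BANDWIDTH_DEFAULT = 10): a 10%
 * setting yields bandwidth_work = 100 / 10 = 10, so recovery io, scaled
 * by recover.io_size / set.io_size to account for the larger recovery
 * stripes, is throttled once it exceeds one tenth of the application
 * (IO_WORK) io counted since the last per-second reset in
 * recover_io_reset().
 */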
2884 /* Try to get a region to recover. */
2885 static int stripe_recover_get_region(struct stripe *stripe)
2887 struct raid_set *rs = RS(stripe->sc);
2888 struct recover *rec = &rs->recover;
2889 struct recover_addr *addr = stripe->recover;
2890 struct dm_dirty_log *dl = rec->dl;
2891 struct dm_rh_client *rh = rec->rh;
2893 BUG_ON(!dl);
2894 BUG_ON(!rh);
2896 /* Report that we already have a region, so it gets finished first during suspension. */
2897 if (addr->reg)
2898 return 1;
2900 if (RSSuspend(rs))
2901 return -EPERM;
2903 if (dl->type->get_sync_count(dl) >= rec->nr_regions)
2904 return -ENOENT;
2906 /* If we don't have enough bandwidth, we don't proceed recovering. */
2907 if (!recover_bandwidth(rs))
2908 return -EAGAIN;
2910 /* Start quiescing a region. */
2911 dm_rh_recovery_prepare(rh);
2912 addr->reg = dm_rh_recovery_start(rh);
2913 if (!addr->reg)
2914 return -EAGAIN;
2916 addr->pos = dm_rh_region_to_sector(rh, dm_rh_get_region_key(addr->reg));
2917 addr->end = addr->pos + dm_rh_get_region_size(rh);
2920 * Take one global io reference out for the
2921 * whole region, which is going to be released
2922 * when the region is completely done with.
2924 io_get(rs);
2925 return 0;
2928 /* Update region hash state. */
2929 enum recover_type { REC_FAILURE = 0, REC_SUCCESS = 1 };
2930 static void recover_rh_update(struct stripe *stripe, enum recover_type success)
2932 struct recover_addr *addr = stripe->recover;
2933 struct raid_set *rs = RS(stripe->sc);
2934 struct recover *rec = &rs->recover;
2936 if (!addr->reg) {
2937 DMERR("%s- Called w/o region", __func__);
2938 return;
2941 dm_rh_recovery_end(addr->reg, success);
2942 if (success)
2943 rec->nr_regions_recovered++;
2945 addr->reg = NULL;
2948 * Completely done with this region ->
2949 * release the 1st io reference.
2951 io_put(rs);
2954 /* Set start of recovery state. */
2955 static void set_start_recovery(struct raid_set *rs)
2957 /* Initialize recovery. */
2958 rs->recover.start_jiffies = jiffies;
2959 rs->recover.end_jiffies = 0;
2962 /* Set end of recovery state. */
2963 static void set_end_recovery(struct raid_set *rs)
2965 ClearRSRecover(rs);
2966 /* Attention: do not reset this any more -> 'i' remains in the status output and userspace might rely on it disappearing! */
2967 rs->set.dev_to_init = -1;
2969 /* Check for jiffies overrun. */
2970 rs->recover.end_jiffies = jiffies;
2971 if (rs->recover.end_jiffies < rs->recover.start_jiffies)
2972 rs->recover.end_jiffies = ~0;
2975 /* Handle recovery on one recovery stripe. */
2976 static int _do_recovery(struct stripe *stripe)
2978 int r;
2979 struct raid_set *rs = RS(stripe->sc);
2980 struct recover_addr *addr = stripe->recover;
2982 /* If recovery is active -> return. */
2983 if (stripe_io_ref(stripe))
2984 return 1;
2986 /* IO error is fatal for recovery -> stop it. */
2987 if (unlikely(StripeError(stripe)))
2988 goto err;
2990 /* Recovery end required. */
2991 if (unlikely(RSDegraded(rs)))
2992 goto err;
2994 /* Get a region to recover. */
2995 r = stripe_recover_get_region(stripe);
2996 switch (r) {
2997 case 0: /* Got a new region: flag initial read before write. */
2998 SetStripeRBW(stripe);
2999 case 1: /* Have a region in the works. */
3000 break;
3001 case -EAGAIN:
3002 /* No bandwidth/quiesced region yet, try later. */
3003 if (!io_ref(rs))
3004 wake_do_raid_delayed(rs, HZ / 4);
3005 case -EPERM:
3006 /* Suspend. */
3007 return 1;
3008 case -ENOENT: /* No more regions to recover. */
3009 schedule_work(&rs->io.ws_do_table_event);
3010 return 0;
3011 default:
3012 BUG();
3015 /* Read/write a recover stripe. */
3016 r = stripe_recover_rw(stripe);
3017 if (r)
3018 /* IO initiated. */
3019 return 1;
3021 /* Read and write finished -> update recovery position within region. */
3022 addr->pos += stripe->io.size;
3024 /* If we're at end of region, update region hash. */
3025 if (addr->pos >= addr->end ||
3026 addr->pos >= rs->set.sectors_per_dev)
3027 recover_rh_update(stripe, REC_SUCCESS);
3028 else
3029 /* Prepare to read next region segment. */
3030 SetStripeRBW(stripe);
3032 /* Schedule myself for another round... */
3033 wake_do_raid(rs);
3034 return 1;
3036 err:
3037 /* FIXME: rather try recovering other regions on error? */
3038 rs_check_degrade(stripe);
3039 recover_rh_update(stripe, REC_FAILURE);
3041 /* Check state of partially recovered array. */
3042 if (RSDegraded(rs) && !RSDead(rs) &&
3043 rs->set.dev_to_init != -1 &&
3044 rs->set.ei != rs->set.dev_to_init) {
3045 /* Broken drive != drive to recover -> FATAL. */
3046 SetRSDead(rs);
3047 DMERR("FATAL: failed device != device to initialize -> "
3048 "RAID set broken");
3051 if (StripeError(stripe) || RSDegraded(rs)) {
3052 char buf[BDEVNAME_SIZE];
3054 DMERR("stopping recovery due to "
3055 "ERROR on /dev/%s, stripe at offset %llu",
3056 bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
3057 (unsigned long long) stripe->key);
3061 /* Make sure, that all quiesced regions get released. */
3062 while (addr->reg) {
3063 dm_rh_recovery_end(addr->reg, -EIO);
3064 addr->reg = dm_rh_recovery_start(rs->recover.rh);
3067 return 0;
3070 /* Called by main io daemon to recover regions. */
3071 static int do_recovery(struct raid_set *rs)
3073 if (RSRecover(rs)) {
3074 int r = 0;
3075 struct stripe *stripe;
3077 list_for_each_entry(stripe, &rs->recover.stripes,
3078 lists[LIST_RECOVER])
3079 r += _do_recovery(stripe);
3081 if (r)
3082 return r;
3084 set_end_recovery(rs);
3085 stripe_recover_free(rs);
3088 return 0;
3092 * END recovery functions
3095 /* End io process all stripes handed in by endio() callback. */
3096 static void _do_endios(struct raid_set *rs, struct stripe *stripe,
3097 struct list_head *flush_list)
3099 /* First unlock all required chunks. */
3100 stripe_chunks_unlock(stripe);
3103 * If an io error on a stripe occurred, degrade the RAID set
3104 * and try to endio as many bios as possible. If any bios can't
3105 * be endio processed, requeue the stripe (stripe_ref() != 0).
3107 if (TestClearStripeError(stripe)) {
3109 * FIXME: if read, rewrite the failed chunk after reconstruction
3110 * in order to trigger disk bad sector relocation.
3112 rs_check_degrade(stripe); /* Resets ChunkError(). */
3113 ClearStripeReconstruct(stripe);
3114 ClearStripeReconstructed(stripe);
3117 * FIXME: if write, don't endio writes in flight and don't
3118 * allow for new writes until userspace has updated
3119 * its metadata.
3123 /* Got to reconstruct a missing chunk. */
3124 if (StripeReconstruct(stripe)) {
3126 * (*2*) We use StripeReconstruct() to allow for
3127 * all chunks to be xored into the reconstructed
3128 * one (see chunk_must_xor()).
3130 stripe_reconstruct(stripe);
3133 * (*3*) Now we reset StripeReconstruct() and flag
3134 * StripeReconstructed() to show to stripe_rw(),
3135 * that we have reconstructed a missing chunk.
3137 ClearStripeReconstruct(stripe);
3138 SetStripeReconstructed(stripe);
3140 /* FIXME: reschedule to be written in case of read. */
3141 /* if (!RSDead && RSDegraded(rs) !StripeRBW(stripe)) {
3142 chunk_set(CHUNK(stripe, stripe->idx.recover), DIRTY);
3143 stripe_chunks_rw(stripe);
3144 } */
3146 stripe->idx.recover = -1;
3150 * Now that we eventually got a complete stripe, we
3151 * can process the rest of the end ios on reads.
3153 stripe_endio(READ, stripe);
3155 /* End io all merged writes if not prohibited. */
3156 if (!RSProhibitWrites(rs) && StripeMerged(stripe)) {
3157 ClearStripeMerged(stripe);
3158 stripe_endio(WRITE_MERGED, stripe);
3161 /* If RAID set is dead -> fail any ios to dead drives. */
3162 if (RSDead(rs)) {
3163 if (!TestSetRSDeadEndioMessage(rs))
3164 DMERR("RAID set dead: failing ios to dead devices");
3166 stripe_fail_io(stripe);
3170 * We have stripe references still,
3171 * because of read-before-writes or IO errors ->
3172 * got to put on flush list for processing.
3174 if (stripe_ref(stripe)) {
3175 BUG_ON(!list_empty(stripe->lists + LIST_LRU));
3176 list_add_tail(stripe->lists + LIST_FLUSH, flush_list);
3177 atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */
3178 } else
3179 stripe_lru_add(stripe);
3182 /* Pop any endio stripes off of the endio list and belabour them. */
3183 static void do_endios(struct raid_set *rs)
3185 struct stripe_cache *sc = &rs->sc;
3186 struct stripe *stripe;
3187 /* IO flush list for sorted requeued stripes. */
3188 struct list_head flush_list;
3190 INIT_LIST_HEAD(&flush_list);
3192 while ((stripe = stripe_endio_pop(sc))) {
3193 /* Avoid endio on stripes with newly io'ed chunks. */
3194 if (!stripe_io_ref(stripe))
3195 _do_endios(rs, stripe, &flush_list);
3199 * Insert any requeued stripes in the proper
3200 * order at the beginning of the io (flush) list.
3202 list_splice(&flush_list, sc->lists + LIST_FLUSH);
3205 /* Flush any stripes on the io list. */
3206 static int do_flush(struct raid_set *rs)
3208 int r = 0;
3209 struct stripe *stripe;
3211 while ((stripe = stripe_io_pop(&rs->sc)))
3212 r += stripe_rw(stripe); /* Read/write stripe. */
3214 return r;
3217 /* Stripe cache resizing. */
3218 static void do_sc_resize(struct raid_set *rs)
3220 unsigned set = atomic_read(&rs->sc.stripes_to_set);
3222 if (set) {
3223 unsigned cur = atomic_read(&rs->sc.stripes);
3224 int r = (set > cur) ? sc_grow(&rs->sc, set - cur, SC_GROW) :
3225 sc_shrink(&rs->sc, cur - set);
3227 /* Flag end of resizing if ok. */
3228 if (!r)
3229 atomic_set(&rs->sc.stripes_to_set, 0);
3234 * Process all ios
3236 * We do different things with the io depending
3237 * on the state of the region that it is in:
3239 * o reads: hang off stripe cache or postpone if full
3241 * o writes:
3243 * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
3244 * In case stripe cache is full or busy, postpone the io.
3246 * RECOVERING: delay the io until recovery of the region completes.
3249 static void do_ios(struct raid_set *rs, struct bio_list *ios)
3251 int r;
3252 unsigned flush = 0, delay = 0;
3253 sector_t sector;
3254 struct dm_rh_client *rh = rs->recover.rh;
3255 struct bio *bio;
3256 struct bio_list reject;
3258 bio_list_init(&reject);
3261 * Classify each io:
3262 * o delay writes to recovering regions (let reads go through)
3263 * o queue io to all other regions
3265 while ((bio = bio_list_pop(ios))) {
3267 * In case we get a barrier bio, push it back onto
3268 * the input queue unless all work queues are empty
3269 * and the stripe cache is inactive.
3271 if (bio->bi_rw & REQ_FLUSH) {
3272 /* REMOVEME: statistics. */
3273 atomic_inc(rs->stats + S_BARRIER);
3274 if (delay ||
3275 !list_empty(rs->sc.lists + LIST_FLUSH) ||
3276 !bio_list_empty(&reject) ||
3277 sc_active(&rs->sc)) {
3278 bio_list_push(ios, bio);
3279 break;
3283 /* If writes prohibited because of failures -> postpone. */
3284 if (RSProhibitWrites(rs) && bio_data_dir(bio) == WRITE) {
3285 bio_list_add(&reject, bio);
3286 continue;
3289 /* Check for recovering regions. */
3290 sector = _sector(rs, bio);
3291 r = region_state(rs, sector, DM_RH_RECOVERING);
3292 if (unlikely(r)) {
3293 delay++;
3294 /* Delay writes to recovering regions. */
3295 dm_rh_delay_by_region(rh, bio,
3296 dm_rh_sector_to_region(rh,
3297 sector));
3298 /* REMOVEME: statistics.*/
3299 atomic_inc(rs->stats + S_DELAYED_BIOS);
3300 atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
3302 /* Force bandwidth tests in recovery. */
3303 SetRSBandwidth(rs);
3304 } else {
3306 * Process ios to non-recovering regions by queueing
3307 * them to stripes (does dm_rh_inc() for writes).
3309 flush += stripe_queue_bio(rs, bio, &reject);
3313 if (flush) {
3314 /* FIXME: better error handling. */
3315 r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
3316 if (r)
3317 DMERR_LIMIT("dirty log flush");
3320 /* Merge any rejected bios back to the head of the input list. */
3321 bio_list_merge_head(ios, &reject);
3324 /* Send an event in case we're getting too busy. */
3325 static void do_busy_event(struct raid_set *rs)
3327 if (sc_busy(rs)) {
3328 if (!TestSetRSScBusy(rs))
3329 schedule_work(&rs->io.ws_do_table_event);
3330 } else
3331 ClearRSScBusy(rs);
3334 /* Throw an event. */
3335 static void do_table_event(struct work_struct *ws)
3337 struct raid_set *rs = container_of(ws, struct raid_set,
3338 io.ws_do_table_event);
3339 dm_table_event(rs->ti->table);
3343 /*-----------------------------------------------------------------
3344 * RAID daemon
3345 *---------------------------------------------------------------*/
3347 * o belabour all end ios
3348 * o update the region hash states
3349 * o optionally shrink the stripe cache
3350 * o optionally do recovery
3351 * o unplug any component raid devices with queued bios
3352 * o grab the input queue
3353 * o work on all requeued or new ios and perform stripe cache flushes
3354 * o unplug any component raid devices with queued bios
3355 * o check if the stripe cache gets too busy and throw an event if so
3357 static void do_raid(struct work_struct *ws)
3359 int r;
3360 struct raid_set *rs = container_of(ws, struct raid_set,
3361 io.dws_do_raid.work);
3362 struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
3365 * We always need to end io, so that ios can get errored in
3366 * case the set failed and the region counters get decremented
3367 * before we update region hash states and go any further.
3369 do_endios(rs);
3370 dm_rh_update_states(rs->recover.rh, 1);
3373 * Now that we've end io'd, which may have put stripes on the LRU list
3374 * to allow for shrinking, we resize the stripe cache if requested.
3376 do_sc_resize(rs);
3378 /* Try to recover regions. */
3379 r = do_recovery(rs);
3381 /* Quickly grab all new ios queued and add them to the work list. */
3382 mutex_lock(&rs->io.in_lock);
3383 bio_list_merge(ios, ios_in);
3384 bio_list_init(ios_in);
3385 mutex_unlock(&rs->io.in_lock);
3387 if (!bio_list_empty(ios))
3388 do_ios(rs, ios); /* Got ios to work into the cache. */
3390 r = do_flush(rs); /* Flush any stripes on io list. */
3392 do_busy_event(rs); /* Check if we got too busy. */
3396 * Callback for region hash to dispatch
3397 * delayed bios queued to recovered regions
3398 * (gets called via dm_rh_update_states()).
3400 static void dispatch_delayed_bios(void *context, struct bio_list *bl)
3402 struct raid_set *rs = context;
3403 struct bio *bio;
3405 /* REMOVEME: statistics; decrement pending delayed bios counter. */
3406 bio_list_for_each(bio, bl)
3407 atomic_dec(rs->stats + S_DELAYED_BIOS);
3409 /* Merge region hash private list to work list. */
3410 bio_list_merge_head(&rs->io.work, bl);
3411 bio_list_init(bl);
3412 ClearRSBandwidth(rs);
3415 /*************************************************************
3416 * Constructor helpers
3417 *************************************************************/
3418 /* Calculate MB/sec. */
3419 static unsigned mbpers(struct raid_set *rs, unsigned io_size)
3421 return to_bytes((rs->xor.speed * rs->set.data_devs *
3422 io_size * HZ / XOR_SPEED_TICKS) >> 10) >> 10;
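/*
 * Worked example (illustrative figures only): a 3-device RAID5 set
 * (2 data devices), io_size = 64 sectors and an xor_speed() result of
 * 1000 xors per tick gives
 *
 *	to_bytes((1000 * 2 * 64 * HZ / XOR_SPEED_TICKS) >> 10) >> 10
 *
 * which works out to 3125 MB/s at HZ = 250.
 */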
3426 * Discover fastest xor algorithm and # of chunks combination.
3428 /* Calculate speed of particular algorithm and # of chunks. */
3429 static unsigned xor_speed(struct stripe *stripe)
3431 int ticks = XOR_SPEED_TICKS;
3432 unsigned p = RS(stripe->sc)->set.raid_devs, r = 0;
3433 unsigned long j;
3435 /* Set uptodate so that common_xor()->xor() will belabour chunks. */
3436 while (p--)
3437 SetChunkUptodate(CHUNK(stripe, p));
3439 /* Wait for next tick. */
3440 for (j = jiffies; j == jiffies; );
3442 /* Do xors for a few ticks. */
3443 while (ticks--) {
3444 unsigned xors = 0;
3446 for (j = jiffies; j == jiffies; ) {
3447 mb();
3448 common_xor(stripe, stripe->io.size, 0, 0);
3449 mb();
3450 xors++;
3451 mb();
3454 if (xors > r)
3455 r = xors;
3458 return r;
3461 /* Define to run the xor optimization over multiple recovery stripes. */
3462 #define DMRAID45_XOR_TEST
3464 /* Optimize xor algorithm for this RAID set. */
3465 static unsigned xor_optimize(struct raid_set *rs)
3467 unsigned chunks_max = 2, speed_max = 0;
3468 struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
3469 struct stripe *stripe;
3470 unsigned io_size = 0, speed_hm = 0, speed_min = ~0, speed_xor_blocks = 0;
3472 BUG_ON(list_empty(&rs->recover.stripes));
3473 #ifndef DMRAID45_XOR_TEST
3474 stripe = list_first_entry(&rs->recover.stripes, struct stripe,
3475 lists[LIST_RECOVER]);
3476 #endif
3478 /* Try all xor functions. */
3479 while (f-- > xor_funcs) {
3480 unsigned speed;
3482 #ifdef DMRAID45_XOR_TEST
3483 list_for_each_entry(stripe, &rs->recover.stripes,
3484 lists[LIST_RECOVER]) {
3485 io_size = stripe->io.size;
3486 #endif
3488 /* Set actual xor function for common_xor(). */
3489 rs->xor.f = f;
3490 rs->xor.chunks = (f->f == xor_blocks_wrapper ?
3491 (MAX_XOR_BLOCKS + 1) :
3492 XOR_CHUNKS_MAX);
3493 if (rs->xor.chunks > rs->set.raid_devs)
3494 rs->xor.chunks = rs->set.raid_devs;
3496 for ( ; rs->xor.chunks > 1; rs->xor.chunks--) {
3497 speed = xor_speed(stripe);
3499 #ifdef DMRAID45_XOR_TEST
3500 if (f->f == xor_blocks_wrapper) {
3501 if (speed > speed_xor_blocks)
3502 speed_xor_blocks = speed;
3503 } else if (speed > speed_hm)
3504 speed_hm = speed;
3506 if (speed < speed_min)
3507 speed_min = speed;
3508 #endif
3510 if (speed > speed_max) {
3511 speed_max = speed;
3512 chunks_max = rs->xor.chunks;
3513 f_max = f;
3516 #ifdef DMRAID45_XOR_TEST
3518 #endif
3521 /* Memorize optimal parameters. */
3522 rs->xor.f = f_max;
3523 rs->xor.chunks = chunks_max;
3524 #ifdef DMRAID45_XOR_TEST
3525 DMINFO("%s stripes=%u/size=%u min=%u xor_blocks=%u hm=%u max=%u",
3526 speed_max == speed_hm ? "HM" : "NB",
3527 rs->recover.recovery_stripes, io_size, speed_min,
3528 speed_xor_blocks, speed_hm, speed_max);
3529 #endif
3530 return speed_max;
3534 * Allocate a RAID context (a RAID set)
3536 /* Structure for variable RAID parameters. */
3537 struct variable_parms {
3538 int bandwidth;
3539 int bandwidth_parm;
3540 int chunk_size;
3541 int chunk_size_parm;
3542 int io_size;
3543 int io_size_parm;
3544 int stripes;
3545 int stripes_parm;
3546 int recover_io_size;
3547 int recover_io_size_parm;
3548 int raid_parms;
3549 int recovery;
3550 int recovery_stripes;
3551 int recovery_stripes_parm;
3554 static struct raid_set *
3555 context_alloc(struct raid_type *raid_type, struct variable_parms *p,
3556 unsigned raid_devs, sector_t sectors_per_dev,
3557 struct dm_target *ti, unsigned dl_parms, char **argv)
3559 int r;
3560 size_t len;
3561 sector_t region_size, ti_len;
3562 struct raid_set *rs = NULL;
3563 struct dm_dirty_log *dl;
3564 struct recover *rec;
3567 * Create the dirty log
3569 * We need to change length for the dirty log constructor,
3570 * because we want an amount of regions for all stripes derived
3571 * from the single device size, so that we can keep region
3572 * size = 2^^n independent of the number of devices
3574 ti_len = ti->len;
3575 ti->len = sectors_per_dev;
3576 dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
3577 ti->len = ti_len;
3578 if (!dl)
3579 goto bad_dirty_log;
3581 /* Chunk size *must* not be larger than region size. */
3582 region_size = dl->type->get_region_size(dl);
3583 if (p->chunk_size > region_size)
3584 goto bad_chunk_size;
3586 /* Recover io size *must* not be larger than region size either. */
3587 if (p->recover_io_size > region_size)
3588 goto bad_recover_io_size;
3590 /* Size and allocate the RAID set structure. */
3591 len = sizeof(*rs->data) + sizeof(*rs->dev);
3592 if (dm_array_too_big(sizeof(*rs), len, raid_devs))
3593 goto bad_array;
3595 len = sizeof(*rs) + raid_devs * len;
3596 rs = kzalloc(len, GFP_KERNEL);
3597 if (!rs)
3598 goto bad_alloc;
3600 rec = &rs->recover;
3601 atomic_set(&rs->io.in_process, 0);
3602 atomic_set(&rs->io.in_process_max, 0);
3603 rec->io_size = p->recover_io_size;
3605 /* Pointer to data array. */
3606 rs->data = (unsigned long **)
3607 ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
3608 rec->dl = dl;
3609 rs->set.raid_devs = raid_devs;
3610 rs->set.data_devs = raid_devs - raid_type->parity_devs;
3611 rs->set.raid_type = raid_type;
3613 rs->set.raid_parms = p->raid_parms;
3614 rs->set.chunk_size_parm = p->chunk_size_parm;
3615 rs->set.io_size_parm = p->io_size_parm;
3616 rs->sc.stripes_parm = p->stripes_parm;
3617 rec->io_size_parm = p->recover_io_size_parm;
3618 rec->bandwidth_parm = p->bandwidth_parm;
3619 rec->recovery = p->recovery;
3620 rec->recovery_stripes = p->recovery_stripes;
3623 * Set chunk and io size and respective shifts
3624 * (used to avoid divisions)
3626 rs->set.chunk_size = p->chunk_size;
3627 rs->set.chunk_shift = ffs(p->chunk_size) - 1;
3629 rs->set.io_size = p->io_size;
3630 rs->set.io_mask = p->io_size - 1;
3631 /* Mask to adjust address key in case io_size != chunk_size. */
3632 rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask;
3634 rs->set.sectors_per_dev = sectors_per_dev;
3636 rs->set.ei = -1; /* Indicate no failed device. */
3637 atomic_set(&rs->set.failed_devs, 0);
3639 rs->ti = ti;
3641 atomic_set(rec->io_count + IO_WORK, 0);
3642 atomic_set(rec->io_count + IO_RECOVER, 0);
3644 /* Initialize io lock and queues. */
3645 mutex_init(&rs->io.in_lock);
3646 mutex_init(&rs->io.xor_lock);
3647 bio_list_init(&rs->io.in);
3648 bio_list_init(&rs->io.work);
3650 init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
3652 rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
3653 rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios,
3654 wake_dummy, wake_do_raid, 0, p->recovery_stripes,
3655 dl, region_size, rec->nr_regions);
3656 if (IS_ERR(rec->rh))
3657 goto bad_rh;
3659 /* Initialize stripe cache. */
3660 r = sc_init(rs, p->stripes);
3661 if (r)
3662 goto bad_sc;
3664 /* REMOVEME: statistics. */
3665 stats_reset(rs);
3666 ClearRSDevelStats(rs); /* Disable development statistics. */
3667 return rs;
3669 bad_dirty_log:
3670 TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM));
3672 bad_chunk_size:
3673 dm_dirty_log_destroy(dl);
3674 TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL));
3676 bad_recover_io_size:
3677 dm_dirty_log_destroy(dl);
3678 TI_ERR_RET("Recover stripe io size larger than region size",
3679 ERR_PTR(-EINVAL));
3681 bad_array:
3682 dm_dirty_log_destroy(dl);
3683 TI_ERR_RET("Arry too big", ERR_PTR(-EINVAL));
3685 bad_alloc:
3686 dm_dirty_log_destroy(dl);
3687 TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM));
3689 bad_rh:
3690 dm_dirty_log_destroy(dl);
3691 ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
3692 goto free_rs;
3694 bad_sc:
3695 dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */
3696 sc_exit(&rs->sc);
3697 ti->error = DM_MSG_PREFIX "Error creating stripe cache";
3698 free_rs:
3699 kfree(rs);
3700 return ERR_PTR(-ENOMEM);
3703 /* Free a RAID context (a RAID set). */
3704 static void context_free(struct raid_set *rs, unsigned p)
3706 while (p--)
3707 dm_put_device(rs->ti, rs->dev[p].dev);
3709 sc_exit(&rs->sc);
3710 dm_region_hash_destroy(rs->recover.rh); /* Destroys dirty log too. */
3711 kfree(rs);
3714 /* Create work queue and initialize delayed work. */
3715 static int rs_workqueue_init(struct raid_set *rs)
3717 struct dm_target *ti = rs->ti;
3719 rs->io.wq = create_singlethread_workqueue(DAEMON);
3720 if (!rs->io.wq)
3721 TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
3723 INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid);
3724 INIT_WORK(&rs->io.ws_do_table_event, do_table_event);
3725 return 0;
3728 /* Return pointer to raid_type structure for raid name. */
3729 static struct raid_type *get_raid_type(char *name)
3731 struct raid_type *r = ARRAY_END(raid_types);
3733 while (r-- > raid_types) {
3734 if (!strcmp(r->name, name))
3735 return r;
3738 return NULL;
3741 /* FIXME: factor out to dm core. */
3742 static int multiple(sector_t a, sector_t b, sector_t *n)
3744 sector_t r = a;
3746 sector_div(r, b);
3747 *n = r;
3748 return a == r * b;
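/*
 * Example (illustrative only): multiple(1000, 8, &n) stores 125 in n and
 * returns 1 because 1000 == 125 * 8, whereas multiple(1000, 16, &n)
 * stores 62 and returns 0; this is how the constructor rejects target or
 * device lengths that don't divide evenly.
 */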
3751 /* Log RAID set information to kernel log. */
3752 static void rs_log(struct raid_set *rs, unsigned io_size)
3754 unsigned p;
3755 char buf[BDEVNAME_SIZE];
3757 for (p = 0; p < rs->set.raid_devs; p++)
3758 DMINFO("/dev/%s is raid disk %u%s",
3759 bdevname(rs->dev[p].dev->bdev, buf), p,
3760 (p == rs->set.pi) ? " (parity)" : "");
3762 DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n"
3763 "algorithm \"%s\", %u chunks with %uMB/s\n"
3764 "%s set with net %u/%u devices",
3765 rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
3766 atomic_read(&rs->sc.stripes),
3767 rs->xor.f->name, rs->xor.chunks, mbpers(rs, io_size),
3768 rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs);
3771 /* Get all devices and offsets. */
3772 static int dev_parms(struct raid_set *rs, char **argv, int *p)
3774 struct dm_target *ti = rs->ti;
3776 DMINFO("rs->set.sectors_per_dev=%llu", (unsigned long long) rs->set.sectors_per_dev);
3777 for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
3778 int r;
3779 unsigned long long tmp;
3780 struct raid_dev *dev = rs->dev + *p;
3782 /* Get offset and device. */
3783 if (sscanf(argv[1], "%llu", &tmp) != 1 ||
3784 tmp > rs->set.sectors_per_dev)
3785 TI_ERR("Invalid RAID device offset parameter");
3787 dev->start = tmp;
3788 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
3789 &dev->dev);
3790 if (r)
3791 TI_ERR_RET("RAID device lookup failure", r);
3793 r = raid_dev_lookup(rs, dev);
3794 if (r != -ENODEV && r < *p) {
3795 (*p)++; /* Ensure dm_put_device() on actual device. */
3796 TI_ERR_RET("Duplicate RAID device", -ENXIO);
3800 return 0;
3803 /* Set recovery bandwidth. */
3804 static void
3805 recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
3807 rs->recover.bandwidth = bandwidth;
3808 rs->recover.bandwidth_work = 100 / bandwidth;
3811 /* Handle variable number of RAID parameters. */
3812 static int get_raid_variable_parms(struct dm_target *ti, char **argv,
3813 struct variable_parms *vp)
3815 int p, value;
3816 struct {
3817 int action; /* -1: skip, 0: no power2 check, 1: power2 check */
3818 char *errmsg;
3819 int min, max;
3820 int *var, *var2, *var3;
3821 } argctr[] = {
3822 { 1,
3823 "Invalid chunk size; must be -1 or 2^^n and <= 16384",
3824 IO_SIZE_MIN, CHUNK_SIZE_MAX,
3825 &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size },
3826 { 0,
3827 "Invalid number of stripes: must be -1 or >= 8 and <= 16384",
3828 STRIPES_MIN, STRIPES_MAX,
3829 &vp->stripes_parm, &vp->stripes, NULL },
3830 { 1,
3831 "Invalid io size; must -1 or >= 8, 2^^n and less equal "
3832 "min(BIO_MAX_SECTORS/2, chunk size)",
3833 IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */
3834 &vp->io_size_parm, &vp->io_size, NULL },
3835 { 1,
3836 "Invalid recovery io size; must be -1 or "
3837 "2^^n and less equal BIO_MAX_SECTORS/2",
3838 RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2,
3839 &vp->recover_io_size_parm, &vp->recover_io_size, NULL },
3840 { 0,
3841 "Invalid recovery bandwidth percentage; "
3842 "must be -1 or > 0 and <= 100",
3843 BANDWIDTH_MIN, BANDWIDTH_MAX,
3844 &vp->bandwidth_parm, &vp->bandwidth, NULL },
3845 /* Handle sync argument separately in loop. */
3846 { -1,
3847 "Invalid recovery switch; must be \"sync\" or \"nosync\"" },
3848 { 0,
3849 "Invalid number of recovery stripes;"
3850 "must be -1, > 0 and <= 64",
3851 RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
3852 &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
3853 }, *varp;
3855 /* Fetch # of variable raid parameters. */
3856 if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 ||
3857 !range_ok(vp->raid_parms, 0, 7))
3858 TI_ERR("Bad variable raid parameters number");
3860 /* Preset variable RAID parameters. */
3861 vp->chunk_size = CHUNK_SIZE_DEFAULT;
3862 vp->io_size = IO_SIZE_DEFAULT;
3863 vp->stripes = STRIPES_DEFAULT;
3864 vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT;
3865 vp->bandwidth = BANDWIDTH_DEFAULT;
3866 vp->recovery = 1;
3867 vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT;
3869 /* Walk the array of argument constraints for all given ones. */
3870 for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) {
3871 BUG_ON(varp >= ARRAY_END(argctr));
3873 /* Special case for "[no]sync" string argument. */
3874 if (varp->action < 0) {
3875 if (!strcmp(*argv, "sync"))
3877 else if (!strcmp(*argv, "nosync"))
3878 vp->recovery = 0;
3879 else
3880 TI_ERR(varp->errmsg);
3882 argv++;
3883 continue;
3887 * Special case for io_size depending
3888 * on previously set chunk size.
3890 if (p == 2)
3891 varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size);
3893 if (sscanf(*(argv++), "%d", &value) != 1 ||
3894 (value != -1 &&
3895 ((varp->action && !is_power_of_2(value)) ||
3896 !range_ok(value, varp->min, varp->max))))
3897 TI_ERR(varp->errmsg);
3899 *varp->var = value;
3900 if (value != -1) {
3901 if (varp->var2)
3902 *varp->var2 = value;
3903 if (varp->var3)
3904 *varp->var3 = value;
3908 return 0;
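/*
 * Example argument run (illustrative only, not from a real table):
 * "7 64 128 8 256 10 nosync 4" requests chunk_size 64, 128 cache
 * stripes, io_size 8, recover_io_size 256, 10% recovery bandwidth, no
 * initial resynchronization and 4 recovery stripes; any numeric value
 * may be passed as -1 to keep its default.
 */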
3911 /* Parse optional locking parameters. */
3912 static int get_raid_locking_parms(struct dm_target *ti, char **argv,
3913 int *locking_parms,
3914 struct dm_raid45_locking_type **locking_type)
3916 if (!strnicmp(argv[0], "locking", strlen(argv[0]))) {
3917 char *lckstr = argv[1];
3918 size_t lcksz = strlen(lckstr);
3920 if (!strnicmp(lckstr, "none", lcksz)) {
3921 *locking_type = &locking_none;
3922 *locking_parms = 2;
3923 } else if (!strnicmp(lckstr, "cluster", lcksz)) {
3924 DMERR("locking type \"%s\" not yet implemented",
3925 lckstr);
3926 return -EINVAL;
3927 } else {
3928 DMERR("unknown locking type \"%s\"", lckstr);
3929 return -EINVAL;
3933 *locking_parms = 0;
3934 *locking_type = &locking_none;
3935 return 0;
3938 /* Set backing device read ahead properties of RAID set. */
3939 static void rs_set_read_ahead(struct raid_set *rs,
3940 unsigned sectors, unsigned stripes)
3942 unsigned ra_pages = dm_div_up(sectors, SECTORS_PER_PAGE);
3943 struct mapped_device *md = dm_table_get_md(rs->ti->table);
3944 struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
3946 /* Set read-ahead for the RAID set and the component devices. */
3947 if (ra_pages) {
3948 unsigned p = rs->set.raid_devs;
3950 bdi->ra_pages = stripes * ra_pages * rs->set.data_devs;
3952 while (p--) {
3953 struct request_queue *q =
3954 bdev_get_queue(rs->dev[p].dev->bdev);
3956 q->backing_dev_info.ra_pages = ra_pages;
3961 /* Set congested function. */
3962 static void rs_set_congested_fn(struct raid_set *rs)
3964 struct mapped_device *md = dm_table_get_md(rs->ti->table);
3965 struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
3967 /* Set congested function and data. */
3968 bdi->congested_fn = rs_congested;
3969 bdi->congested_data = rs;
3973 * Construct a RAID4/5 mapping:
3975 * log_type #log_params <log_params> \
3976 * raid_type [#parity_dev] #raid_variable_params <raid_params> \
3977 * [locking "none"/"cluster"]
3978 * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
3980 * log_type = "core"/"disk",
3981 * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
3982 * log_params = [dirty_log_path] region_size [[no]sync])
3984 * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
3986 * #parity_dev = N if raid_type = "raid4"
3987 * o N = -1: pick default = last device
3988 * o N >= 0 and < #raid_devs: parity device index
3990 * #raid_variable_params = 0-7; raid_params (-1 = default):
3991 * [chunk_size [#stripes [io_size [recover_io_size \
3992 * [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]]
3993 * o chunk_size (unit to calculate drive addresses; must be 2^^n, >= 8
3994 * and <= CHUNK_SIZE_MAX)
3995 * o #stripes is number of stripes allocated to stripe cache
3996 * (must be >= STRIPES_MIN and <= STRIPES_MAX)
3997 * o io_size (io unit size per device in sectors; must be 2^^n and >= 8)
3998 * o recover_io_size (io unit size per device for recovery in sectors;
3999 * must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
4000 * o %recovery_bandwidth is the maximum amount spent for recovery during
4001 * application io (1-100%)
4002 * o recovery switch = [sync|nosync]
4003 * o #recovery_stripes is the number of recovery stripes used for
4004 * parallel recovery of the RAID set
4005 * If raid_variable_params = 0, defaults will be used.
4006 * Any raid_variable_param can be set to -1 to apply a default
4008 * #raid_devs = N (N >= 3)
4010 * #dev_to_initialize = N
4011 * -1: initialize parity on all devices
4012 * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
4013 * of a failed device's content after replacement
4015 * <dev_path> = device_path (eg, /dev/sdd1)
4016 * <offset> = begin at offset on <dev_path>
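/*
 * Illustrative dmsetup table line (device paths and sizes are made up
 * for the example):
 *
 *	0 2097152 raid45 core 2 8192 nosync raid5_la 0 3 -1 \
 *		/dev/sda1 0 /dev/sdb1 0 /dev/sdc1 0
 *
 * i.e. a core dirty log with an 8192-sector region size and no initial
 * resync, a left-asymmetric RAID5 across three devices with all variable
 * parameters left at their defaults (0) and no particular device forced
 * to initialize (-1).
 */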
4019 #define MIN_PARMS 13
4020 static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
4022 int dev_to_init, dl_parms, i, locking_parms,
4023 parity_parm, pi = -1, r, raid_devs;
4024 sector_t tmp, sectors_per_dev;
4025 struct dm_raid45_locking_type *locking;
4026 struct raid_set *rs;
4027 struct raid_type *raid_type;
4028 struct variable_parms parms;
4030 /* Ensure minimum number of parameters. */
4031 if (argc < MIN_PARMS)
4032 TI_ERR("Not enough parameters");
4034 /* Fetch # of dirty log parameters. */
4035 if (sscanf(argv[1], "%d", &dl_parms) != 1 ||
4036 !range_ok(dl_parms, 1, 4711)) /* ;-) */
4037 TI_ERR("Bad dirty log parameters number");
4039 /* Check raid_type. */
4040 raid_type = get_raid_type(argv[dl_parms + 2]);
4041 if (!raid_type)
4042 TI_ERR("Bad raid type");
4044 /* In case of RAID4, parity drive is selectable. */
4045 parity_parm = !!(raid_type->level == raid4);
4047 /* Handle variable number of RAID parameters. */
4048 r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3,
4049 &parms);
4050 if (r)
4051 return r;
4053 /* Handle any locking parameters. */
4054 r = get_raid_locking_parms(ti,
4055 argv + dl_parms + parity_parm +
4056 parms.raid_parms + 4,
4057 &locking_parms, &locking);
4058 if (r)
4059 return r;
4061 /* # of raid devices. */
4062 i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4;
4063 if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
4064 raid_devs < raid_type->minimal_devs)
4065 TI_ERR("Invalid number of raid devices");
4067 /* In case of RAID4, check parity drive index is in limits. */
4068 if (raid_type->level == raid4) {
4069 /* Fetch index of parity device. */
4070 if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
4071 (pi != -1 && !range_ok(pi, 0, raid_devs - 1)))
4072 TI_ERR("Invalid RAID4 parity device index");
4076 * Index of device to initialize starts at 0
4078 * o -1 -> don't initialize a selected device;
4079 * initialize parity conforming to algorithm
4080 * o 0..raid_devs-1 -> initialize respective device
4081 * (used for reconstruction of a replaced device)
4083 if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms +
4084 locking_parms + 5], "%d", &dev_to_init) != 1 ||
4085 !range_ok(dev_to_init, -1, raid_devs - 1))
4086 TI_ERR("Invalid number for raid device to initialize");
4088 /* Check # of raid device arguments. */
4089 if (argc - dl_parms - parity_parm - parms.raid_parms - 6 !=
4090 2 * raid_devs)
4091 TI_ERR("Wrong number of raid device/offset arguments");
4094 * Check that the table length is divisible
4095 * without remainder by (raid_devs - parity_devs)
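 * (purely illustrative numbers: a 4-device raid5 set has one parity
 * device, so a target length of 6291456 sectors gives
 * sectors_per_dev = 6291456 / 3 = 2097152 with no remainder)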
4097 if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
4098 &sectors_per_dev))
4099 TI_ERR("Target length not divisible by number of data devices");
4102 * Check that the device size is
4103 * divisible without remainder by the chunk size
4105 if (!multiple(sectors_per_dev, parms.chunk_size, &tmp))
4106 TI_ERR("Device length not divisible by chunk_size");
4108 /****************************************************************
4109 * Now that we checked the constructor arguments ->
4110 * let's allocate the RAID set
4111 ****************************************************************/
4112 rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev,
4113 ti, dl_parms, argv);
4114 if (IS_ERR(rs))
4115 return PTR_ERR(rs);
4118 rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init;
4119 rs->set.pi = rs->set.pi_parm = pi;
4121 /* Set RAID4 parity drive index. */
4122 if (raid_type->level == raid4)
4123 rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
4125 recover_set_bandwidth(rs, parms.bandwidth);
4127 /* Use locking type to lock stripe access. */
4128 rs->locking = locking;
4130 /* Get the device/offset tuples. */
4131 argv += dl_parms + 6 + parity_parm + parms.raid_parms;
4132 r = dev_parms(rs, argv, &i);
4133 if (r)
4134 goto err;
4136 /* Set backing device information (e.g. read ahead). */
4137 rs_set_read_ahead(rs, 2 * rs->set.chunk_size /* sectors per device */,
4138 2 /* # of stripes */);
4139 rs_set_congested_fn(rs); /* Set congested function. */
4140 SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
4141 rs->xor.speed = xor_optimize(rs); /* Select best xor algorithm. */
4143 /* Set for recovery of any nosync regions. */
4144 if (parms.recovery)
4145 SetRSRecover(rs);
4146 else {
4148 * Need to free recovery stripe(s) here in case
4149 * of nosync, because xor_optimize uses one.
4151 set_start_recovery(rs);
4152 set_end_recovery(rs);
4153 stripe_recover_free(rs);
4157 * Enforce parity chunk creation for small numbers of
4158 * array members, where xoring parity out and back in
4159 * doesn't gain us the performance it does with larger
4160 * array member counts.
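 * (assuming minimal_devs is the documented minimum of 3 for raid4/raid5,
 * this covers sets of 3 or 4 members; larger sets skip the enforcement)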
4162 if (rs->set.raid_devs <= rs->set.raid_type->minimal_devs + 1)
4163 SetRSEnforceParityCreation(rs);
4166 * Make sure that dm core only hands maximum io size
4167 * length down and pays attention to io boundaries.
4169 ti->split_io = rs->set.io_size;
4170 ti->private = rs;
4172 /* Initialize work queue to handle this RAID set's io. */
4173 r = rs_workqueue_init(rs);
4174 if (r)
4175 goto err;
4177 rs_log(rs, rs->recover.io_size); /* Log information about RAID set. */
4178 return 0;
4180 err:
4181 context_free(rs, i);
4182 return r;
4186 * Destruct a raid mapping
4188 static void raid_dtr(struct dm_target *ti)
4190 struct raid_set *rs = ti->private;
4192 destroy_workqueue(rs->io.wq);
4193 context_free(rs, rs->set.raid_devs);
4196 /* Raid mapping function. */
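/*
 * Bios are not remapped inline: read-ahead is rejected outright and
 * everything else is queued to the set's input list for the daemon to
 * process, hence the DM_MAPIO_SUBMITTED return.
 */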
4197 static int raid_map(struct dm_target *ti, struct bio *bio,
4198 union map_info *map_context)
4200 /* Don't waste stripe cache capacity on read-ahead io. */
4201 if (bio_rw(bio) == READA)
4202 return -EIO;
4203 else {
4204 struct raid_set *rs = ti->private;
4207 * Take an io reference which we wait on to drop
4208 * to zero on device suspension/destruction.
4210 io_get(rs);
4211 bio->bi_sector -= ti->begin; /* Remap sector. */
4213 /* Queue io to RAID set. */
4214 mutex_lock(&rs->io.in_lock);
4215 bio_list_add(&rs->io.in, bio);
4216 mutex_unlock(&rs->io.in_lock);
4218 /* Wake daemon to process input list. */
4219 wake_do_raid(rs);
4221 /* REMOVEME: statistics. */
4222 atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
4223 S_BIOS_READ : S_BIOS_WRITE));
4224 return DM_MAPIO_SUBMITTED; /* Handle later. */
4228 /* Device suspend. */
4229 static void raid_presuspend(struct dm_target *ti)
4231 struct raid_set *rs = ti->private;
4232 struct dm_dirty_log *dl = rs->recover.dl;
4234 SetRSSuspend(rs);
4236 if (RSRecover(rs))
4237 dm_rh_stop_recovery(rs->recover.rh);
4239 cancel_delayed_work(&rs->io.dws_do_raid);
4240 flush_workqueue(rs->io.wq);
4241 wait_ios(rs); /* Wait for completion of all ios being processed. */
4243 if (dl->type->presuspend && dl->type->presuspend(dl))
4244 /* FIXME: need better error handling. */
4245 DMWARN("log presuspend failed");
4248 static void raid_postsuspend(struct dm_target *ti)
4250 struct raid_set *rs = ti->private;
4251 struct dm_dirty_log *dl = rs->recover.dl;
4253 if (dl->type->postsuspend && dl->type->postsuspend(dl))
4254 /* FIXME: need better error handling. */
4255 DMWARN("log postsuspend failed");
4259 /* Device resume. */
4260 static void raid_resume(struct dm_target *ti)
4262 struct raid_set *rs = ti->private;
4263 struct recover *rec = &rs->recover;
4264 struct dm_dirty_log *dl = rec->dl;
4266 DMINFO("%s...", __func__);
4267 if (dl->type->resume && dl->type->resume(dl))
4268 /* Resume dirty log. */
4269 /* FIXME: need better error handling. */
4270 DMWARN("log resume failed");
4272 rec->nr_regions_to_recover =
4273 rec->nr_regions - dl->type->get_sync_count(dl);
4275 /* Restart any unfinished recovery. */
4276 if (RSRecover(rs)) {
4277 set_start_recovery(rs);
4278 dm_rh_start_recovery(rec->rh);
4281 ClearRSSuspend(rs);
4284 /* Return stripe cache size. */
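/*
 * This is a rough estimate, in sectors, of the memory bound up in the
 * stripe cache: bookkeeping and page payload scaled by the number of
 * cached stripes, with the recovery stripes' payload counted only while
 * recovery is still in flight (end_jiffies == 0).
 */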
4285 static unsigned sc_size(struct raid_set *rs)
4287 return to_sector(atomic_read(&rs->sc.stripes) *
4288 (sizeof(struct stripe) +
4289 (sizeof(struct stripe_chunk) +
4290 (sizeof(struct page_list) +
4291 to_bytes(rs->set.io_size) *
4292 rs->set.raid_devs)) +
4293 (rs->recover.end_jiffies ?
4294 0 : rs->recover.recovery_stripes *
4295 to_bytes(rs->set.raid_devs * rs->recover.io_size))));
4298 /* REMOVEME: status output for development. */
4299 static void raid_devel_stats(struct dm_target *ti, char *result,
4300 unsigned *size, unsigned maxlen)
4302 unsigned sz = *size;
4303 unsigned long j;
4304 char buf[BDEVNAME_SIZE], *p;
4305 struct stats_map *sm;
4306 struct raid_set *rs = ti->private;
4307 struct recover *rec = &rs->recover;
4308 struct timespec ts;
4310 DMEMIT("%s %s=%u bw=%u\n",
4311 version, rs->xor.f->name, rs->xor.chunks, rs->recover.bandwidth);
4312 DMEMIT("act_ios=%d ", io_ref(rs));
4313 DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max));
4314 DMEMIT("act_stripes=%d ", sc_active(&rs->sc));
4315 DMEMIT("act_stripes_max=%d\n",
4316 atomic_read(&rs->sc.active_stripes_max));
4318 for (sm = stats_map; sm < ARRAY_END(stats_map); sm++)
4319 DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
4321 DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off");
4322 DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size,
4323 atomic_read(&rs->sc.stripes), rs->set.io_size,
4324 rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets,
4325 sc_size(rs));
4327 j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
4328 rec->start_jiffies;
4329 jiffies_to_timespec(j, &ts);
4330 sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
4331 p = strchr(buf, '.');
4332 p[3] = 0;
4334 DMEMIT("rg=%llu/%llu/%llu/%u %s\n",
4335 (unsigned long long) rec->nr_regions_recovered,
4336 (unsigned long long) rec->nr_regions_to_recover,
4337 (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
4339 *size = sz;
4342 static int raid_status(struct dm_target *ti, status_type_t type,
4343 char *result, unsigned maxlen)
4345 unsigned p, sz = 0;
4346 char buf[BDEVNAME_SIZE];
4347 struct raid_set *rs = ti->private;
4348 struct dm_dirty_log *dl = rs->recover.dl;
4349 int raid_parms[] = {
4350 rs->set.chunk_size_parm,
4351 rs->sc.stripes_parm,
4352 rs->set.io_size_parm,
4353 rs->recover.io_size_parm,
4354 rs->recover.bandwidth_parm,
4356 rs->recover.recovery_stripes,
4359 switch (type) {
4360 case STATUSTYPE_INFO:
4361 /* REMOVEME: statistics. */
4362 if (RSDevelStats(rs))
4363 raid_devel_stats(ti, result, &sz, maxlen);
4365 DMEMIT("%u ", rs->set.raid_devs);
4367 for (p = 0; p < rs->set.raid_devs; p++)
4368 DMEMIT("%s ",
4369 format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev));
4371 DMEMIT("2 ");
4372 for (p = 0; p < rs->set.raid_devs; p++) {
4373 DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D');
4375 if (p == rs->set.pi)
4376 DMEMIT("p");
4378 if (p == rs->set.dev_to_init)
4379 DMEMIT("i");
4382 DMEMIT(" %llu/%llu ",
4383 (unsigned long long) dl->type->get_sync_count(dl),
4384 (unsigned long long) rs->recover.nr_regions);
4386 sz += dl->type->status(dl, type, result+sz, maxlen-sz);
4387 break;
4388 case STATUSTYPE_TABLE:
4389 sz = rs->recover.dl->type->status(rs->recover.dl, type,
4390 result, maxlen);
4391 DMEMIT("%s %u ", rs->set.raid_type->name, rs->set.raid_parms);
4393 for (p = 0; p < rs->set.raid_parms; p++) {
4394 if (raid_parms[p] > -2)
4395 DMEMIT("%d ", raid_parms[p]);
4396 else
4397 DMEMIT("%s ", rs->recover.recovery ?
4398 "sync" : "nosync");
4401 DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
4403 for (p = 0; p < rs->set.raid_devs; p++)
4404 DMEMIT("%s %llu ",
4405 format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev),
4406 (unsigned long long) rs->dev[p].start);
4409 return 0;
4413 * Message interface
4415 /* Turn a delta into an absolute value. */
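/*
 * Illustrative calls with invented values:
 *
 *   _absolute("set", 10, 50)   -> 50       (use the given value as is)
 *   _absolute("grow", 10, 5)   -> 15       (current 10 grown by 5)
 *   _absolute("shrink", 10, 3) ->  7       (current 10 shrunk by 3)
 *   _absolute("bogus", 10, 3)  -> -EINVAL
 */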
4416 static int _absolute(char *action, int act, int r)
4418 size_t len = strlen(action);
4420 if (len < 2)
4421 len = 2;
4423 /* Make delta absolute. */
4424 if (!strncmp("set", action, len))
	; /* "set": keep the given value as is. */
4426 else if (!strncmp("grow", action, len))
4427 r += act;
4428 else if (!strncmp("shrink", action, len))
4429 r = act - r;
4430 else
4431 r = -EINVAL;
4433 return r;
4436 /* Change recovery io bandwidth. */
4437 static int bandwidth_change(struct raid_set *rs, int argc, char **argv,
4438 enum raid_set_flags flag)
4440 int act = rs->recover.bandwidth, bandwidth;
4442 if (argc != 2)
4443 return -EINVAL;
4445 if (sscanf(argv[1], "%d", &bandwidth) == 1 &&
4446 range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4447 /* Make delta bandwidth absolute. */
4448 bandwidth = _absolute(argv[0], act, bandwidth);
4450 /* Check range. */
4451 if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4452 recover_set_bandwidth(rs, bandwidth);
4453 return 0;
4457 return -EINVAL;
4460 /* Set/reset development feature flags. */
4461 static int devel_flags(struct raid_set *rs, int argc, char **argv,
4462 enum raid_set_flags flag)
4464 size_t len;
4466 if (argc != 1)
4467 return -EINVAL;
4469 len = strlen(argv[0]);
4470 if (len < 2)
4471 len = 2;
4473 if (!strncmp(argv[0], "on", len))
4474 return test_and_set_bit(flag, &rs->io.flags) ? -EPERM : 0;
4475 else if (!strncmp(argv[0], "off", len))
4476 return test_and_clear_bit(flag, &rs->io.flags) ? 0 : -EPERM;
4477 else if (!strncmp(argv[0], "reset", len)) {
4478 if (flag == RS_DEVEL_STATS) {
4479 if (test_bit(flag, &rs->io.flags)) {
4480 stats_reset(rs);
4481 return 0;
4482 } else
4483 return -EPERM;
4484 } else {
4485 set_bit(flag, &rs->io.flags);
4486 return 0;
4490 return -EINVAL;
4493 /* Resize the stripe cache. */
4494 static int sc_resize(struct raid_set *rs, int argc, char **argv,
4495 enum raid_set_flags flag)
4497 int act, stripes;
4499 if (argc != 2)
4500 return -EINVAL;
4502 /* Deny permission in case the daemon is still resizing! */
4503 if (atomic_read(&rs->sc.stripes_to_set))
4504 return -EPERM;
4506 if (sscanf(argv[1], "%d", &stripes) == 1 &&
4507 stripes > 0) {
4508 act = atomic_read(&rs->sc.stripes);
4510 /* Make delta stripes absolute. */
4511 stripes = _absolute(argv[0], act, stripes);
4514 * Check range and that the # of stripes changes.
4515 * We leave the resizing to the worker.
4517 if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX) &&
4518 stripes != atomic_read(&rs->sc.stripes)) {
4519 atomic_set(&rs->sc.stripes_to_set, stripes);
4520 wake_do_raid(rs);
4521 return 0;
4525 return -EINVAL;
4528 /* Change xor algorithm and number of chunks. */
4529 static int xor_set(struct raid_set *rs, int argc, char **argv,
4530 enum raid_set_flags flag)
4532 if (argc == 2) {
4533 int chunks;
4534 char *algorithm = argv[0];
4535 struct xor_func *f = ARRAY_END(xor_funcs);
4537 if (sscanf(argv[1], "%d", &chunks) == 1 &&
4538 range_ok(chunks, 2, XOR_CHUNKS_MAX) &&
4539 chunks <= rs->set.raid_devs) {
4540 while (f-- > xor_funcs) {
4541 if (!strcmp(algorithm, f->name)) {
4542 unsigned io_size = 0;
4543 struct stripe *stripe = stripe_alloc(&rs->sc, rs->sc.mem_cache_client, SC_GROW);
4545 DMINFO("xor: %s", f->name);
4546 if (f->f == xor_blocks_wrapper &&
4547 chunks > MAX_XOR_BLOCKS + 1) {
4548 DMERR("chunks > MAX_XOR_BLOCKS"
4549 " + 1");
4550 break;
4553 mutex_lock(&rs->io.xor_lock);
4554 rs->xor.f = f;
4555 rs->xor.chunks = chunks;
4556 rs->xor.speed = 0;
4557 mutex_unlock(&rs->io.xor_lock);
4559 if (stripe) {
4560 rs->xor.speed = xor_speed(stripe);
4561 io_size = stripe->io.size;
4562 stripe_free(stripe, rs->sc.mem_cache_client);
4565 rs_log(rs, io_size);
4566 return 0;
4572 return -EINVAL;
4576 * Allow writes again after they were prohibited because of a device failure.
4578 * This needs to be called once userspace has updated the metadata state
4579 * based on an event thrown during device failure processing.
4581 static int allow_writes(struct raid_set *rs, int argc, char **argv,
4582 enum raid_set_flags flag)
4584 if (TestClearRSProhibitWrites(rs)) {
4585 DMINFO("%s waking", __func__);
4586 wake_do_raid(rs);
4587 return 0;
4590 return -EPERM;
4593 /* Parse the RAID message. */
4595 * 'all[ow_writes]'
4596 * 'ba[ndwidth] {se[t],g[row],sh[rink]} #' # e.g. 'ba se 50'
4597 * 'o[verwrite] {on,of[f],r[eset]}' # e.g. 'o of'
4598 * 'sta[tistics] {on,of[f],r[eset]}' # e.g. 'stat of'
4599 * 'str[ipe_cache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024'
4600 * 'xor algorithm #chunks' # e.g. 'xor xor_8 5'
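 *
 * Hypothetical dmsetup invocations against a mapping named "r5" (the
 * name is invented for illustration):
 *
 *   dmsetup message r5 0 bandwidth set 25
 *   dmsetup message r5 0 stripe_cache grow 256
 *   dmsetup message r5 0 statistics reset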
4603 static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
4605 if (argc) {
4606 size_t len = strlen(argv[0]);
4607 struct raid_set *rs = ti->private;
4608 struct {
4609 const char *name;
4610 int (*f) (struct raid_set *rs, int argc, char **argv,
4611 enum raid_set_flags flag);
4612 enum raid_set_flags flag;
4613 } msg_descr[] = {
4614 { "allow_writes", allow_writes, 0 },
4615 { "bandwidth", bandwidth_change, 0 },
4616 { "overwrite", devel_flags, RS_CHECK_OVERWRITE },
4617 { "statistics", devel_flags, RS_DEVEL_STATS },
4618 { "stripe_cache", sc_resize, 0 },
4619 { "xor", xor_set, 0 },
4620 }, *m = ARRAY_END(msg_descr);
4622 if (len < 3)
4623 len = 3;
4625 while (m-- > msg_descr) {
4626 if (!strncmp(argv[0], m->name, len))
4627 return m->f(rs, argc - 1, argv + 1, m->flag);
4632 return -EINVAL;
4635 * END message interface
4638 /* Provide io hints. */
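/*
 * With an invented geometry of a 64-sector chunk and 3 data devices,
 * the minimum io hint is one chunk (64) and the optimal io hint is one
 * full data stripe, chunk_size * data_devs = 64 * 3 = 192.
 */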
4639 static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
4641 struct raid_set *rs = ti->private;
4643 blk_limits_io_min(limits, rs->set.chunk_size);
4644 blk_limits_io_opt(limits, rs->set.chunk_size * rs->set.data_devs);
4647 static struct target_type raid_target = {
4648 .name = "raid45",
4649 .version = {1, 0, 0},
4650 .module = THIS_MODULE,
4651 .ctr = raid_ctr,
4652 .dtr = raid_dtr,
4653 .map = raid_map,
4654 .presuspend = raid_presuspend,
4655 .postsuspend = raid_postsuspend,
4656 .resume = raid_resume,
4657 .status = raid_status,
4658 .message = raid_message,
4659 .io_hints = raid_io_hints,
4662 static void init_exit(const char *bad_msg, const char *good_msg, int r)
4664 if (r)
4665 DMERR("Failed to %sregister target [%d]", bad_msg, r);
4666 else
4667 DMINFO("%s %s", good_msg, version);
4670 static int __init dm_raid_init(void)
4672 int r = dm_register_target(&raid_target);
4674 init_exit("", "initialized", r);
4675 return r;
4678 static void __exit dm_raid_exit(void)
4680 dm_unregister_target(&raid_target);
4681 init_exit("un", "exit", 0);
4684 /* Module hooks. */
4685 module_init(dm_raid_init);
4686 module_exit(dm_raid_exit);
4688 MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
4689 MODULE_AUTHOR("Heinz Mauelshagen <heinzm@redhat.com>");
4690 MODULE_LICENSE("GPL");
4691 MODULE_ALIAS("dm-raid4");
4692 MODULE_ALIAS("dm-raid5");