fs/exofs/super.c

   1 /*
   2  * Copyright (C) 2005, 2006
   3  * Avishay Traeger (avishay@gmail.com)
   4  * Copyright (C) 2008, 2009
   5  * Boaz Harrosh <bharrosh@panasas.com>
   6  *
   7  * Copyrights for code taken from ext2:
   8  *     Copyright (C) 1992, 1993, 1994, 1995
   9  *     Remy Card (card@masi.ibp.fr)
  10  *     Laboratoire MASI - Institut Blaise Pascal
  11  *     Universite Pierre et Marie Curie (Paris VI)
  12  *     from
  13  *     linux/fs/minix/inode.c
  14  *     Copyright (C) 1991, 1992  Linus Torvalds
  15  *
  16  * This file is part of exofs.
  17  *
  18  * exofs is free software; you can redistribute it and/or modify
  19  * it under the terms of the GNU General Public License as published by
  20  * the Free Software Foundation.  Since it is based on ext2, and the only
  21  * valid version of GPL for the Linux kernel is version 2, the only valid
  22  * version of GPL for exofs is version 2.
  23  *
  24  * exofs is distributed in the hope that it will be useful,
  25  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  26  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  27  * GNU General Public License for more details.
  28  *
  29  * You should have received a copy of the GNU General Public License
  30  * along with exofs; if not, write to the Free Software
  31  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  32  */
  33
  34 #include <linux/string.h>
  35 #include <linux/parser.h>
  36 #include <linux/vfs.h>
  37 #include <linux/random.h>
  38 #include <linux/module.h>
  39 #include <linux/exportfs.h>
  40 #include <linux/slab.h>
  41
  42 #include "exofs.h"
  43
  44 #define EXOFS_DBGMSG2(M...) do {} while (0)
  45
  46 /******************************************************************************
  47  * MOUNT OPTIONS
  48  *****************************************************************************/
  49
  50 /*
  51  * struct to hold what we get from mount options
  52  */
  53 struct exofs_mountopt {
  54         bool is_osdname;
  55         const char *dev_name;
  56         uint64_t pid;
  57         int timeout;
  58 };
  59
  60 /*
  61  * exofs-specific mount-time options.
  62  */
  63 enum { Opt_name, Opt_pid, Opt_to, Opt_err };
  64
  65 /*
  66  * Our mount-time options.  These should ideally be 64-bit unsigned, but the
  67  * kernel's parsing functions do not currently support that.  32-bit should be
  68  * sufficient for most applications now.
  69  */
  70 static match_table_t tokens = {
  71         {Opt_name, "osdname=%s"},
  72         {Opt_pid, "pid=%u"},
  73         {Opt_to, "to=%u"},
  74         {Opt_err, NULL}
  75 };
  76
  77 /*
  78  * The main option parsing method.  Also makes sure that all of the mandatory
  79  * mount options were set.
  80  */
  81 static int parse_options(char *options, struct exofs_mountopt *opts)
  82 {
  83         char *p;
  84         substring_t args[MAX_OPT_ARGS];
  85         int option;
  86         bool s_pid = false;
  87
  88         EXOFS_DBGMSG("parse_options %s\n", options);
  89         /* defaults */
  90         memset(opts, 0, sizeof(*opts));
  91         opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
  92
  93         while ((p = strsep(&options, ",")) != NULL) {
  94                 int token;
  95                 char str[32];
  96
  97                 if (!*p)
  98                         continue;
  99
 100                 token = match_token(p, tokens, args);
 101                 switch (token) {
 102                 case Opt_name:
 103                         opts->dev_name = match_strdup(&args[0]);
 104                         if (unlikely(!opts->dev_name)) {
 105                                 EXOFS_ERR("Error allocating dev_name");
 106                                 return -ENOMEM;
 107                         }
 108                         opts->is_osdname = true;
 109                         break;
 110                 case Opt_pid:
 111                         if (0 == match_strlcpy(str, &args[0], sizeof(str)))
 112                                 return -EINVAL;
 113                         opts->pid = simple_strtoull(str, NULL, 0);
 114                         if (opts->pid < EXOFS_MIN_PID) {
 115                                 EXOFS_ERR("Partition ID must be >= %u",
 116                                           EXOFS_MIN_PID);
 117                                 return -EINVAL;
 118                         }
 119                         s_pid = 1;
 120                         break;
 121                 case Opt_to:
 122                         if (match_int(&args[0], &option))
 123                                 return -EINVAL;
 124                         if (option <= 0) {
 125                                 EXOFS_ERR("Timout must be > 0");
 126                                 return -EINVAL;
 127                         }
 128                         opts->timeout = option * HZ;
 129                         break;
 130                 }
 131         }
 132
 133         if (!s_pid) {
 134                 EXOFS_ERR("Need to specify the following options:\n");
 135                 EXOFS_ERR("    -o pid=pid_no_to_use\n");
 136                 return -EINVAL;
 137         }
 138
 139         return 0;
 140 }
 141
 142 /******************************************************************************
 143  * INODE CACHE
 144  *****************************************************************************/
 145
 146 /*
 147  * Our inode cache.  Isn't it pretty?
 148  */
 149 static struct kmem_cache *exofs_inode_cachep;
 150
 151 /*
 152  * Allocate an inode in the cache
 153  */
 154 static struct inode *exofs_alloc_inode(struct super_block *sb)
 155 {
 156         struct exofs_i_info *oi;
 157
 158         oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
 159         if (!oi)
 160                 return NULL;
 161
 162         oi->vfs_inode.i_version = 1;
 163         return &oi->vfs_inode;
 164 }
 165
 166 static void exofs_i_callback(struct rcu_head *head)
 167 {
 168         struct inode *inode = container_of(head, struct inode, i_rcu);
 169         INIT_LIST_HEAD(&inode->i_dentry);
 170         kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
 171 }
 172
 173 /*
 174  * Remove an inode from the cache
 175  */
 176 static void exofs_destroy_inode(struct inode *inode)
 177 {
 178         call_rcu(&inode->i_rcu, exofs_i_callback);
 179 }
 180
 181 /*
 182  * Initialize the inode
 183  */
 184 static void exofs_init_once(void *foo)
 185 {
 186         struct exofs_i_info *oi = foo;
 187
 188         inode_init_once(&oi->vfs_inode);
 189 }
 190
 191 /*
 192  * Create and initialize the inode cache
 193  */
 194 static int init_inodecache(void)
 195 {
 196         exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
 197                                 sizeof(struct exofs_i_info), 0,
 198                                 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
 199                                 exofs_init_once);
 200         if (exofs_inode_cachep == NULL)
 201                 return -ENOMEM;
 202         return 0;
 203 }
 204
 205 /*
 206  * Destroy the inode cache
 207  */
 208 static void destroy_inodecache(void)
 209 {
 210         kmem_cache_destroy(exofs_inode_cachep);
 211 }
 212
 213 /******************************************************************************
 214  * Some osd helpers
 215  *****************************************************************************/
 216 void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
 217 {
 218         osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
 219 }
 220
 221 static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
 222                     u64 offset, void *p, unsigned length)
 223 {
 224         struct osd_request *or = osd_start_request(od, GFP_KERNEL);
 225 /*      struct osd_sense_info osi = {.key = 0};*/
 226         int ret;
 227
 228         if (unlikely(!or)) {
 229                 EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
 230                 return -ENOMEM;
 231         }
 232         ret = osd_req_read_kern(or, obj, offset, p, length);
 233         if (unlikely(ret)) {
 234                 EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
 235                 goto out;
 236         }
 237
 238         ret = osd_finalize_request(or, 0, cred, NULL);
 239         if (unlikely(ret)) {
 240                 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
 241                 goto out;
 242         }
 243
 244         ret = osd_execute_request(or);
 245         if (unlikely(ret))
 246                 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
 247         /* osd_req_decode_sense(or, ret); */
 248
 249 out:
 250         osd_end_request(or);
 251         EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
 252                       "length=0x%llx dev=%p ret=>%d\n",
 253                       _LLU(obj->id), _LLU(offset), _LLU(length), od, ret);
 254         return ret;
 255 }
 256
 257 static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
 258         EXOFS_APAGE_SB_DATA,
 259         EXOFS_ATTR_SB_STATS,
 260         sizeof(struct exofs_sb_stats));
 261
 262 static int __sbi_read_stats(struct exofs_sb_info *sbi)
 263 {
 264         struct osd_attr attrs[] = {
 265                 [0] = g_attr_sb_stats,
 266         };
 267         struct ore_io_state *ios;
 268         int ret;
 269
 270         ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
 271         if (unlikely(ret)) {
 272                 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
 273                 return ret;
 274         }
 275
 276         ios->in_attr = attrs;
 277         ios->in_attr_len = ARRAY_SIZE(attrs);
 278
 279         ret = ore_read(ios);
 280         if (unlikely(ret)) {
 281                 EXOFS_ERR("Error reading super_block stats => %d\n", ret);
 282                 goto out;
 283         }
 284
 285         ret = extract_attr_from_ios(ios, &attrs[0]);
 286         if (ret) {
 287                 EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
 288                 goto out;
 289         }
 290         if (attrs[0].len) {
 291                 struct exofs_sb_stats *ess;
 292
 293                 if (unlikely(attrs[0].len != sizeof(*ess))) {
 294                         EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
 295                                   "size(%d) != expected(%zd)\n",
 296                                   __func__, attrs[0].len, sizeof(*ess));
 297                         goto out;
 298                 }
 299
 300                 ess = attrs[0].val_ptr;
 301                 sbi->s_nextid = le64_to_cpu(ess->s_nextid);
 302                 sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
 303         }
 304
 305 out:
 306         ore_put_io_state(ios);
 307         return ret;
 308 }
 309
 310 static void stats_done(struct ore_io_state *ios, void *p)
 311 {
 312         ore_put_io_state(ios);
 313         /* Good thanks nothing to do anymore */
 314 }
 315
 316 /* Asynchronously write the stats attribute */
 317 int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
 318 {
 319         struct osd_attr attrs[] = {
 320                 [0] = g_attr_sb_stats,
 321         };
 322         struct ore_io_state *ios;
 323         int ret;
 324
 325         ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
 326         if (unlikely(ret)) {
 327                 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
 328                 return ret;
 329         }
 330
 331         sbi->s_ess.s_nextid   = cpu_to_le64(sbi->s_nextid);
 332         sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
 333         attrs[0].val_ptr = &sbi->s_ess;
 334
 335
 336         ios->done = stats_done;
 337         ios->private = sbi;
 338         ios->out_attr = attrs;
 339         ios->out_attr_len = ARRAY_SIZE(attrs);
 340
 341         ret = ore_write(ios);
 342         if (unlikely(ret)) {
 343                 EXOFS_ERR("%s: ore_write failed.\n", __func__);
 344                 ore_put_io_state(ios);
 345         }
 346
 347         return ret;
 348 }
 349
 350 /******************************************************************************
 351  * SUPERBLOCK FUNCTIONS
 352  *****************************************************************************/
 353 static const struct super_operations exofs_sops;
 354 static const struct export_operations exofs_export_ops;
 355
 356 /*
 357  * Write the superblock to the OSD
 358  */
 359 int exofs_sync_fs(struct super_block *sb, int wait)
 360 {
 361         struct exofs_sb_info *sbi;
 362         struct exofs_fscb *fscb;
 363         struct ore_comp one_comp;
 364         struct ore_components comps;
 365         struct ore_io_state *ios;
 366         int ret = -ENOMEM;
 367
 368         fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
 369         if (unlikely(!fscb))
 370                 return -ENOMEM;
 371
 372         sbi = sb->s_fs_info;
 373
 374         /* NOTE: We no longer dirty the super_block anywhere in exofs. The
 375          * reason we write the fscb here on unmount is so we can stay backwards
 376          * compatible with fscb->s_version == 1. (What we are not compatible
 377          * with is if a new version FS crashed and then we try to mount an old
 378          * version). Otherwise the exofs_fscb is read-only from mkfs time. All
 379          * the writeable info is set in exofs_sbi_write_stats() above.
 380          */
 381
 382         exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID);
 383
 384         ret = ore_get_io_state(&sbi->layout, &comps, &ios);
 385         if (unlikely(ret))
 386                 goto out;
 387
 388         lock_super(sb);
 389
 390         ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
 391         memset(fscb, 0, ios->length);
 392         fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
 393         fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
 394         fscb->s_magic = cpu_to_le16(sb->s_magic);
 395         fscb->s_newfs = 0;
 396         fscb->s_version = EXOFS_FSCB_VER;
 397
 398         ios->offset = 0;
 399         ios->kern_buff = fscb;
 400
 401         ret = ore_write(ios);
 402         if (unlikely(ret))
 403                 EXOFS_ERR("%s: ore_write failed.\n", __func__);
 404         else
 405                 sb->s_dirt = 0;
 406
 407
 408         unlock_super(sb);
 409 out:
 410         EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
 411         ore_put_io_state(ios);
 412         kfree(fscb);
 413         return ret;
 414 }
 415
 416 static void exofs_write_super(struct super_block *sb)
 417 {
 418         if (!(sb->s_flags & MS_RDONLY))
 419                 exofs_sync_fs(sb, 1);
 420         else
 421                 sb->s_dirt = 0;
 422 }
 423
 424 static void _exofs_print_device(const char *msg, const char *dev_path,
 425                                 struct osd_dev *od, u64 pid)
 426 {
 427         const struct osd_dev_info *odi = osduld_device_info(od);
 428
 429         printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
 430                 msg, dev_path ?: "", odi->osdname, _LLU(pid));
 431 }
 432
 433 void exofs_free_sbi(struct exofs_sb_info *sbi)
 434 {
 435         while (sbi->comps.numdevs) {
 436                 int i = --sbi->comps.numdevs;
 437                 struct osd_dev *od = sbi->comps.ods[i];
 438
 439                 if (od) {
 440                         sbi->comps.ods[i] = NULL;
 441                         osduld_put_device(od);
 442                 }
 443         }
 444         if (sbi->comps.ods != sbi->_min_one_dev)
 445                 kfree(sbi->comps.ods);
 446         kfree(sbi);
 447 }
 448
 449 /*
 450  * This function is called when the vfs is freeing the superblock.  We just
 451  * need to free our own part.
 452  */
 453 static void exofs_put_super(struct super_block *sb)
 454 {
 455         int num_pend;
 456         struct exofs_sb_info *sbi = sb->s_fs_info;
 457
 458         /* make sure there are no pending commands */
 459         for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
 460              num_pend = atomic_read(&sbi->s_curr_pending)) {
 461                 wait_queue_head_t wq;
 462
 463                 printk(KERN_NOTICE "%s: !!Pending operations in flight. "
 464                        "This is a BUG. please report to osd-dev@open-osd.org\n",
 465                        __func__);
 466                 init_waitqueue_head(&wq);
 467                 wait_event_timeout(wq,
 468                                   (atomic_read(&sbi->s_curr_pending) == 0),
 469                                   msecs_to_jiffies(100));
 470         }
 471
 472         _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0],
 473                             sbi->one_comp.obj.partition);
 474
 475         bdi_destroy(&sbi->bdi);
 476         exofs_free_sbi(sbi);
 477         sb->s_fs_info = NULL;
 478 }
 479
 480 static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 481                                     struct exofs_device_table *dt)
 482 {
 483         u64 stripe_length;
 484
 485         sbi->layout.stripe_unit =
 486                                 le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
 487         sbi->layout.group_width =
 488                                 le32_to_cpu(dt->dt_data_map.cb_group_width);
 489         sbi->layout.group_depth =
 490                                 le32_to_cpu(dt->dt_data_map.cb_group_depth);
 491         sbi->layout.mirrors_p1  =
 492                                 le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
 493         sbi->layout.raid_algorithm  =
 494                                 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
 495
 496 /* FIXME: Only raid0 for now. if not so, do not mount */
 497         if (sbi->layout.raid_algorithm != PNFS_OSD_RAID_0) {
 498                 EXOFS_ERR("Only RAID_0 for now\n");
 499                 return -EINVAL;
 500         }
 501         if (numdevs < (sbi->layout.group_width * sbi->layout.mirrors_p1)) {
 502                 EXOFS_ERR("Data Map wrong, "
 503                           "numdevs=%d < group_width=%d * mirrors=%d\n",
 504                           numdevs, sbi->layout.group_width,
 505                           sbi->layout.mirrors_p1);
 506                 return -EINVAL;
 507         }
 508
 509         if (0 != (sbi->layout.stripe_unit & ~PAGE_MASK)) {
 510                 EXOFS_ERR("Stripe Unit(0x%llx)"
 511                           " must be Multples of PAGE_SIZE(0x%lx)\n",
 512                           _LLU(sbi->layout.stripe_unit), PAGE_SIZE);
 513                 return -EINVAL;
 514         }
 515
 516         if (sbi->layout.group_width) {
 517                 if (!sbi->layout.group_depth) {
 518                         EXOFS_ERR("group_depth == 0 && group_width != 0\n");
 519                         return -EINVAL;
 520                 }
 521                 sbi->layout.group_count = numdevs / sbi->layout.mirrors_p1 /
 522                                                 sbi->layout.group_width;
 523         } else {
 524                 if (sbi->layout.group_depth) {
 525                         printk(KERN_NOTICE "Warning: group_depth ignored "
 526                                 "group_width == 0 && group_depth == %lld\n",
 527                                 _LLU(sbi->layout.group_depth));
 528                 }
 529                 sbi->layout.group_width = numdevs / sbi->layout.mirrors_p1;
 530                 sbi->layout.group_depth = -1;
 531                 sbi->layout.group_count = 1;
 532         }
 533
 534         stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
 535         if (stripe_length >= (1ULL << 32)) {
 536                 EXOFS_ERR("Total Stripe length(0x%llx)"
 537                           " >= 32bit is not supported\n", _LLU(stripe_length));
 538                 return -EINVAL;
 539         }
 540
 541         EXOFS_DBGMSG("exofs: layout: "
 542                 "num_comps=%u stripe_unit=0x%x group_width=%u "
 543                 "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n",
 544                 numdevs,
 545                 sbi->layout.stripe_unit,
 546                 sbi->layout.group_width,
 547                 _LLU(sbi->layout.group_depth),
 548                 sbi->layout.mirrors_p1,
 549                 sbi->layout.raid_algorithm);
 550         return 0;
 551 }
 552
 553 static unsigned __ra_pages(struct ore_layout *layout)
 554 {
 555         const unsigned _MIN_RA = 32; /* min 128K read-ahead */
 556         unsigned ra_pages = layout->group_width * layout->stripe_unit /
 557                                 PAGE_SIZE;
 558         unsigned max_io_pages = exofs_max_io_pages(layout, ~0);
 559
 560         ra_pages *= 2; /* two stripes */
 561         if (ra_pages < _MIN_RA)
 562                 ra_pages = roundup(_MIN_RA, ra_pages / 2);
 563
 564         if (ra_pages > max_io_pages)
 565                 ra_pages = max_io_pages;
 566
 567         return ra_pages;
 568 }
 569
 570 /* @odi is valid only as long as @fscb_dev is valid */
 571 static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
 572                              struct osd_dev_info *odi)
 573 {
 574         odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
 575         memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
 576
 577         odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
 578         odi->osdname = dt_dev->osdname;
 579
 580         /* FIXME support long names. Will need a _put function */
 581         if (dt_dev->long_name_offset)
 582                 return -EINVAL;
 583
 584         /* Make sure osdname is printable!
 585          * mkexofs should give us space for a null-terminator else the
 586          * device-table is invalid.
 587          */
 588         if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
 589                 odi->osdname_len = sizeof(dt_dev->osdname) - 1;
 590         dt_dev->osdname[odi->osdname_len] = 0;
 591
 592         /* If it's all zeros something is bad we read past end-of-obj */
 593         return !(odi->systemid_len || odi->osdname_len);
 594 }
 595
 596 static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 597                                        struct osd_dev *fscb_od,
 598                                        unsigned table_count)
 599 {
 600         struct ore_comp comp;
 601         struct exofs_device_table *dt;
 602         unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
 603                                              sizeof(*dt);
 604         unsigned numdevs, i;
 605         int ret;
 606
 607         dt = kmalloc(table_bytes, GFP_KERNEL);
 608         if (unlikely(!dt)) {
 609                 EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
 610                           table_bytes);
 611                 return -ENOMEM;
 612         }
 613
 614         sbi->comps.numdevs = 0;
 615
 616         comp.obj.partition = sbi->one_comp.obj.partition;
 617         comp.obj.id = EXOFS_DEVTABLE_ID;
 618         exofs_make_credential(comp.cred, &comp.obj);
 619
 620         ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt,
 621                               table_bytes);
 622         if (unlikely(ret)) {
 623                 EXOFS_ERR("ERROR: reading device table\n");
 624                 goto out;
 625         }
 626
 627         numdevs = le64_to_cpu(dt->dt_num_devices);
 628         if (unlikely(!numdevs)) {
 629                 ret = -EINVAL;
 630                 goto out;
 631         }
 632         WARN_ON(table_count != numdevs);
 633
 634         ret = _read_and_match_data_map(sbi, numdevs, dt);
 635         if (unlikely(ret))
 636                 goto out;
 637
 638         if (likely(numdevs > 1)) {
 639                 unsigned size = numdevs * sizeof(sbi->comps.ods[0]);
 640
 641                 /* Twice bigger table: See exofs_init_comps() and below
 642                  * comment
 643                  */
 644                 sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL);
 645                 if (unlikely(!sbi->comps.ods)) {
 646                         EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
 647                                   numdevs);
 648                         ret = -ENOMEM;
 649                         goto out;
 650                 }
 651         }
 652
 653         for (i = 0; i < numdevs; i++) {
 654                 struct exofs_fscb fscb;
 655                 struct osd_dev_info odi;
 656                 struct osd_dev *od;
 657
 658                 if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
 659                         EXOFS_ERR("ERROR: Read all-zeros device entry\n");
 660                         ret = -EINVAL;
 661                         goto out;
 662                 }
 663
 664                 printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
 665                        i, odi.osdname);
 666
 667                 /* On all devices the device table is identical. The user can
 668                  * specify any one of the participating devices on the command
 669                  * line. We always keep them in device-table order.
 670                  */
 671                 if (fscb_od && osduld_device_same(fscb_od, &odi)) {
 672                         sbi->comps.ods[i] = fscb_od;
 673                         ++sbi->comps.numdevs;
 674                         fscb_od = NULL;
 675                         continue;
 676                 }
 677
 678                 od = osduld_info_lookup(&odi);
 679                 if (IS_ERR(od)) {
 680                         ret = PTR_ERR(od);
 681                         EXOFS_ERR("ERROR: device requested is not found "
 682                                   "osd_name-%s =>%d\n", odi.osdname, ret);
 683                         goto out;
 684                 }
 685
 686                 sbi->comps.ods[i] = od;
 687                 ++sbi->comps.numdevs;
 688
 689                 /* Read the fscb of the other devices to make sure the FS
 690                  * partition is there.
 691                  */
 692                 ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb,
 693                                       sizeof(fscb));
 694                 if (unlikely(ret)) {
 695                         EXOFS_ERR("ERROR: Malformed participating device "
 696                                   "error reading fscb osd_name-%s\n",
 697                                   odi.osdname);
 698                         goto out;
 699                 }
 700
 701                 /* TODO: verify other information is correct and FS-uuid
 702                  *       matches. Benny what did you say about device table
 703                  *       generation and old devices?
 704                  */
 705         }
 706
 707 out:
 708         kfree(dt);
 709         if (likely(!ret)) {
 710                 unsigned numdevs = sbi->comps.numdevs;
 711
 712                 if (unlikely(fscb_od)) {
 713                         EXOFS_ERR("ERROR: Bad device-table container device not present\n");
 714                         osduld_put_device(fscb_od);
 715                         return -EINVAL;
 716                 }
 717                 /* exofs round-robins the device table view according to inode
 718                  * number. We hold a: twice bigger table hence inodes can point
 719                  * to any device and have a sequential view of the table
 720                  * starting at this device. See exofs_init_comps()
 721                  */
 722                 for (i = 0; i < numdevs - 1; ++i)
 723                         sbi->comps.ods[i + numdevs] = sbi->comps.ods[i];
 724         }
 725         return ret;
 726 }
 727
 728 /*
 729  * Read the superblock from the OSD and fill in the fields
 730  */
 731 static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 732 {
 733         struct inode *root;
 734         struct exofs_mountopt *opts = data;
 735         struct exofs_sb_info *sbi;      /*extended info                  */
 736         struct osd_dev *od;             /* Master device                 */
 737         struct exofs_fscb fscb;         /*on-disk superblock info        */
 738         struct ore_comp comp;
 739         unsigned table_count;
 740         int ret;
 741
 742         sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 743         if (!sbi)
 744                 return -ENOMEM;
 745
 746         /* use mount options to fill superblock */
 747         if (opts->is_osdname) {
 748                 struct osd_dev_info odi = {.systemid_len = 0};
 749
 750                 odi.osdname_len = strlen(opts->dev_name);
 751                 odi.osdname = (u8 *)opts->dev_name;
 752                 od = osduld_info_lookup(&odi);
 753                 kfree(opts->dev_name);
 754                 opts->dev_name = NULL;
 755         } else {
 756                 od = osduld_path_lookup(opts->dev_name);
 757         }
 758         if (IS_ERR(od)) {
 759                 ret = -EINVAL;
 760                 goto free_sbi;
 761         }
 762
 763         /* Default layout in case we do not have a device-table */
 764         sbi->layout.stripe_unit = PAGE_SIZE;
 765         sbi->layout.mirrors_p1 = 1;
 766         sbi->layout.group_width = 1;
 767         sbi->layout.group_depth = -1;
 768         sbi->layout.group_count = 1;
 769         sbi->s_timeout = opts->timeout;
 770
 771         sbi->one_comp.obj.partition = opts->pid;
 772         sbi->one_comp.obj.id = 0;
 773         exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
 774         sbi->comps.numdevs = 1;
 775         sbi->comps.single_comp = EC_SINGLE_COMP;
 776         sbi->comps.comps = &sbi->one_comp;
 777         sbi->comps.ods = sbi->_min_one_dev;
 778
 779         /* fill in some other data by hand */
 780         memset(sb->s_id, 0, sizeof(sb->s_id));
 781         strcpy(sb->s_id, "exofs");
 782         sb->s_blocksize = EXOFS_BLKSIZE;
 783         sb->s_blocksize_bits = EXOFS_BLKSHIFT;
 784         sb->s_maxbytes = MAX_LFS_FILESIZE;
 785         atomic_set(&sbi->s_curr_pending, 0);
 786         sb->s_bdev = NULL;
 787         sb->s_dev = 0;
 788
 789         comp.obj.partition = sbi->one_comp.obj.partition;
 790         comp.obj.id = EXOFS_SUPER_ID;
 791         exofs_make_credential(comp.cred, &comp.obj);
 792
 793         ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb));
 794         if (unlikely(ret))
 795                 goto free_sbi;
 796
 797         sb->s_magic = le16_to_cpu(fscb.s_magic);
 798         /* NOTE: we read below to be backward compatible with old versions */
 799         sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
 800         sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
 801
 802         /* make sure what we read from the object store is correct */
 803         if (sb->s_magic != EXOFS_SUPER_MAGIC) {
 804                 if (!silent)
 805                         EXOFS_ERR("ERROR: Bad magic value\n");
 806                 ret = -EINVAL;
 807                 goto free_sbi;
 808         }
 809         if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
 810                 EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
 811                           EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
 812                 ret = -EINVAL;
 813                 goto free_sbi;
 814         }
 815
 816         /* start generation numbers from a random point */
 817         get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 818         spin_lock_init(&sbi->s_next_gen_lock);
 819
 820         table_count = le64_to_cpu(fscb.s_dev_table_count);
 821         if (table_count) {
 822                 ret = exofs_read_lookup_dev_table(sbi, od, table_count);
 823                 if (unlikely(ret))
 824                         goto free_sbi;
 825         } else {
 826                 sbi->comps.ods[0] = od;
 827         }
 828
 829         __sbi_read_stats(sbi);
 830
 831         /* set up operation vectors */
 832         sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
 833         sb->s_bdi = &sbi->bdi;
 834         sb->s_fs_info = sbi;
 835         sb->s_op = &exofs_sops;
 836         sb->s_export_op = &exofs_export_ops;
 837         root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
 838         if (IS_ERR(root)) {
 839                 EXOFS_ERR("ERROR: exofs_iget failed\n");
 840                 ret = PTR_ERR(root);
 841                 goto free_sbi;
 842         }
 843         sb->s_root = d_alloc_root(root);
 844         if (!sb->s_root) {
 845                 iput(root);
 846                 EXOFS_ERR("ERROR: get root inode failed\n");
 847                 ret = -ENOMEM;
 848                 goto free_sbi;
 849         }
 850
 851         if (!S_ISDIR(root->i_mode)) {
 852                 dput(sb->s_root);
 853                 sb->s_root = NULL;
 854                 EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
 855                        root->i_mode);
 856                 ret = -EINVAL;
 857                 goto free_sbi;
 858         }
 859
 860         ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
 861         if (ret) {
 862                 EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
 863                 goto free_sbi;
 864         }
 865
 866         _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0],
 867                             sbi->one_comp.obj.partition);
 868         return 0;
 869
 870 free_sbi:
 871         EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
 872                   opts->dev_name, sbi->one_comp.obj.partition, ret);
 873         exofs_free_sbi(sbi);
 874         return ret;
 875 }
 876
 877 /*
 878  * Set up the superblock (calls exofs_fill_super eventually)
 879  */
 880 static struct dentry *exofs_mount(struct file_system_type *type,
 881                           int flags, const char *dev_name,
 882                           void *data)
 883 {
 884         struct exofs_mountopt opts;
 885         int ret;
 886
 887         ret = parse_options(data, &opts);
 888         if (ret)
 889                 return ERR_PTR(ret);
 890
 891         if (!opts.dev_name)
 892                 opts.dev_name = dev_name;
 893         return mount_nodev(type, flags, &opts, exofs_fill_super);
 894 }
 895
 896 /*
 897  * Return information about the file system state in the buffer.  This is used
 898  * by the 'df' command, for example.
 899  */
 900 static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 901 {
 902         struct super_block *sb = dentry->d_sb;
 903         struct exofs_sb_info *sbi = sb->s_fs_info;
 904         struct ore_io_state *ios;
 905         struct osd_attr attrs[] = {
 906                 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
 907                         OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
 908                 ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
 909                         OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
 910         };
 911         uint64_t capacity = ULLONG_MAX;
 912         uint64_t used = ULLONG_MAX;
 913         int ret;
 914
 915         ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
 916         if (ret) {
 917                 EXOFS_DBGMSG("ore_get_io_state failed.\n");
 918                 return ret;
 919         }
 920
 921         ios->in_attr = attrs;
 922         ios->in_attr_len = ARRAY_SIZE(attrs);
 923
 924         ret = ore_read(ios);
 925         if (unlikely(ret))
 926                 goto out;
 927
 928         ret = extract_attr_from_ios(ios, &attrs[0]);
 929         if (likely(!ret)) {
 930                 capacity = get_unaligned_be64(attrs[0].val_ptr);
 931                 if (unlikely(!capacity))
 932                         capacity = ULLONG_MAX;
 933         } else
 934                 EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
 935
 936         ret = extract_attr_from_ios(ios, &attrs[1]);
 937         if (likely(!ret))
 938                 used = get_unaligned_be64(attrs[1].val_ptr);
 939         else
 940                 EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
 941
 942         /* fill in the stats buffer */
 943         buf->f_type = EXOFS_SUPER_MAGIC;
 944         buf->f_bsize = EXOFS_BLKSIZE;
 945         buf->f_blocks = capacity >> 9;
 946         buf->f_bfree = (capacity - used) >> 9;
 947         buf->f_bavail = buf->f_bfree;
 948         buf->f_files = sbi->s_numfiles;
 949         buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
 950         buf->f_namelen = EXOFS_NAME_LEN;
 951
 952 out:
 953         ore_put_io_state(ios);
 954         return ret;
 955 }
 956
/*
 * Super block operations of exofs.  exofs_fill_super() installs this table
 * in sb->s_op.
 */
static const struct super_operations exofs_sops = {
        /* inode life cycle */
        .alloc_inode    = exofs_alloc_inode,
        .destroy_inode  = exofs_destroy_inode,
        .write_inode    = exofs_write_inode,
        .evict_inode    = exofs_evict_inode,
        /* super block life cycle and write-back */
        .put_super      = exofs_put_super,
        .write_super    = exofs_write_super,
        .sync_fs        = exofs_sync_fs,
        /* file system statistics */
        .statfs         = exofs_statfs,
};
 967
 968 /******************************************************************************
 969  * EXPORT OPERATIONS
 970  *****************************************************************************/
 971
 972 struct dentry *exofs_get_parent(struct dentry *child)
 973 {
 974         unsigned long ino = exofs_parent_ino(child);
 975
 976         if (!ino)
 977                 return ERR_PTR(-ESTALE);
 978
 979         return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
 980 }
 981
 982 static struct inode *exofs_nfs_get_inode(struct super_block *sb,
 983                 u64 ino, u32 generation)
 984 {
 985         struct inode *inode;
 986
 987         inode = exofs_iget(sb, ino);
 988         if (IS_ERR(inode))
 989                 return ERR_CAST(inode);
 990         if (generation && inode->i_generation != generation) {
 991                 /* we didn't find the right inode.. */
 992                 iput(inode);
 993                 return ERR_PTR(-ESTALE);
 994         }
 995         return inode;
 996 }
 997
 998 static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
 999                                 struct fid *fid, int fh_len, int fh_type)
1000 {
1001         return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1002                                     exofs_nfs_get_inode);
1003 }
1004
1005 static struct dentry *exofs_fh_to_parent(struct super_block *sb,
1006                                 struct fid *fid, int fh_len, int fh_type)
1007 {
1008         return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1009                                     exofs_nfs_get_inode);
1010 }
1011
/*
 * Export operations of exofs.  exofs_fill_super() installs this table in
 * sb->s_export_op.
 */
static const struct export_operations exofs_export_ops = {
        /* both file-handle decoders look inodes up via exofs_nfs_get_inode() */
        .fh_to_dentry = exofs_fh_to_dentry,
        .fh_to_parent = exofs_fh_to_parent,
        .get_parent = exofs_get_parent,
};
1017
1018 /******************************************************************************
1019  * INSMOD/RMMOD
1020  *****************************************************************************/
1021
/*
 * struct that describes this file system.  It is registered in init_exofs()
 * and unregistered in exit_exofs().
 */
static struct file_system_type exofs_type = {
        .owner          = THIS_MODULE,
        .name           = "exofs",
        /* parses the mount options, then creates the sb via mount_nodev() */
        .mount          = exofs_mount,
        /*
         * NOTE(review): super blocks are created through mount_nodev() and
         * exofs_fill_super() resets sb->s_dev to 0 - confirm that
         * generic_shutdown_super is the intended teardown for that path.
         */
        .kill_sb        = generic_shutdown_super,
};
1031
1032 static int __init init_exofs(void)
1033 {
1034         int err;
1035
1036         err = init_inodecache();
1037         if (err)
1038                 goto out;
1039
1040         err = register_filesystem(&exofs_type);
1041         if (err)
1042                 goto out_d;
1043
1044         return 0;
1045 out_d:
1046         destroy_inodecache();
1047 out:
1048         return err;
1049 }
1050
/*
 * Module unload: undo init_exofs() in reverse order - unregister the file
 * system type first, then destroy the inode cache.
 */
static void __exit exit_exofs(void)
{
        unregister_filesystem(&exofs_type);
        destroy_inodecache();
}
1056
/* module metadata */
MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
MODULE_DESCRIPTION("exofs");
MODULE_LICENSE("GPL");

/* module entry and exit points */
module_init(init_exofs)
module_exit(exit_exofs)