fs/btrfs/zlib.c

   1 /*
   2  * Copyright (C) 2008 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  *
  18  * Based on jffs2 zlib code:
  19  * Copyright © 2001-2007 Red Hat, Inc.
  20  * Created by David Woodhouse <dwmw2@infradead.org>
  21  */
  22
  23 #include <linux/kernel.h>
  24 #include <linux/slab.h>
  25 #include <linux/zlib.h>
  26 #include <linux/zutil.h>
  27 #include <linux/vmalloc.h>
  28 #include <linux/init.h>
  29 #include <linux/err.h>
  30 #include <linux/sched.h>
  31 #include <linux/pagemap.h>
  32 #include <linux/bio.h>
  33 #include "compression.h"
  34
  35 /* Plan: call deflate() with avail_in == *sourcelen,
  36         avail_out = *dstlen - 12 and flush == Z_FINISH.
  37         If it doesn't manage to finish, call it again with
  38         avail_in == 0 and avail_out set to the remaining 12
  39         bytes for it to clean up.
  40    Q: Is 12 bytes sufficient?
  41 */
  42 #define STREAM_END_SPACE 12
  43
  44 struct workspace {
  45         z_stream inf_strm;
  46         z_stream def_strm;
  47         char *buf;
  48         struct list_head list;
  49 };
  50
  51 static LIST_HEAD(idle_workspace);
  52 static DEFINE_SPINLOCK(workspace_lock);
  53 static unsigned long num_workspace;
  54 static atomic_t alloc_workspace = ATOMIC_INIT(0);
  55 static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
  56
  57 /*
  58  * this finds an available zlib workspace or allocates a new one
  59  * NULL or an ERR_PTR is returned if things go bad.
  60  */
  61 static struct workspace *find_zlib_workspace(void)
  62 {
  63         struct workspace *workspace;
  64         int ret;
  65         int cpus = num_online_cpus();
  66
  67 again:
  68         spin_lock(&workspace_lock);
  69         if (!list_empty(&idle_workspace)) {
  70                 workspace = list_entry(idle_workspace.next, struct workspace,
  71                                        list);
  72                 list_del(&workspace->list);
  73                 num_workspace--;
  74                 spin_unlock(&workspace_lock);
  75                 return workspace;
  76
  77         }
  78         spin_unlock(&workspace_lock);
  79         if (atomic_read(&alloc_workspace) > cpus) {
  80                 DEFINE_WAIT(wait);
  81                 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
  82                 if (atomic_read(&alloc_workspace) > cpus)
  83                         schedule();
  84                 finish_wait(&workspace_wait, &wait);
  85                 goto again;
  86         }
  87         atomic_inc(&alloc_workspace);
  88         workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
  89         if (!workspace) {
  90                 ret = -ENOMEM;
  91                 goto fail;
  92         }
  93
  94         workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
  95         if (!workspace->def_strm.workspace) {
  96                 ret = -ENOMEM;
  97                 goto fail;
  98         }
  99         workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
 100         if (!workspace->inf_strm.workspace) {
 101                 ret = -ENOMEM;
 102                 goto fail_inflate;
 103         }
 104         workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
 105         if (!workspace->buf) {
 106                 ret = -ENOMEM;
 107                 goto fail_kmalloc;
 108         }
 109         return workspace;
 110
 111 fail_kmalloc:
 112         vfree(workspace->inf_strm.workspace);
 113 fail_inflate:
 114         vfree(workspace->def_strm.workspace);
 115 fail:
 116         kfree(workspace);
 117         atomic_dec(&alloc_workspace);
 118         wake_up(&workspace_wait);
 119         return ERR_PTR(ret);
 120 }
 121
 122 /*
 123  * put a workspace struct back on the list or free it if we have enough
 124  * idle ones sitting around
 125  */
 126 static int free_workspace(struct workspace *workspace)
 127 {
 128         spin_lock(&workspace_lock);
 129         if (num_workspace < num_online_cpus()) {
 130                 list_add_tail(&workspace->list, &idle_workspace);
 131                 num_workspace++;
 132                 spin_unlock(&workspace_lock);
 133                 if (waitqueue_active(&workspace_wait))
 134                         wake_up(&workspace_wait);
 135                 return 0;
 136         }
 137         spin_unlock(&workspace_lock);
 138         vfree(workspace->def_strm.workspace);
 139         vfree(workspace->inf_strm.workspace);
 140         kfree(workspace->buf);
 141         kfree(workspace);
 142
 143         atomic_dec(&alloc_workspace);
 144         if (waitqueue_active(&workspace_wait))
 145                 wake_up(&workspace_wait);
 146         return 0;
 147 }
 148
 149 /*
 150  * cleanup function for module exit
 151  */
 152 static void free_workspaces(void)
 153 {
 154         struct workspace *workspace;
 155         while (!list_empty(&idle_workspace)) {
 156                 workspace = list_entry(idle_workspace.next, struct workspace,
 157                                        list);
 158                 list_del(&workspace->list);
 159                 vfree(workspace->def_strm.workspace);
 160                 vfree(workspace->inf_strm.workspace);
 161                 kfree(workspace->buf);
 162                 kfree(workspace);
 163                 atomic_dec(&alloc_workspace);
 164         }
 165 }
 166
 167 /*
 168  * given an address space and start/len, compress the bytes.
 169  *
 170  * pages are allocated to hold the compressed result and stored
 171  * in 'pages'
 172  *
 173  * out_pages is used to return the number of pages allocated.  There
 174  * may be pages allocated even if we return an error
 175  *
 176  * total_in is used to return the number of bytes actually read.  It
 177  * may be smaller then len if we had to exit early because we
 178  * ran out of room in the pages array or because we cross the
 179  * max_out threshold.
 180  *
 181  * total_out is used to return the total number of compressed bytes
 182  *
 183  * max_out tells us the max number of bytes that we're allowed to
 184  * stuff into pages
 185  */
 186 int btrfs_zlib_compress_pages(struct address_space *mapping,
 187                               u64 start, unsigned long len,
 188                               struct page **pages,
 189                               unsigned long nr_dest_pages,
 190                               unsigned long *out_pages,
 191                               unsigned long *total_in,
 192                               unsigned long *total_out,
 193                               unsigned long max_out)
 194 {
 195         int ret;
 196         struct workspace *workspace;
 197         char *data_in;
 198         char *cpage_out;
 199         int nr_pages = 0;
 200         struct page *in_page = NULL;
 201         struct page *out_page = NULL;
 202         unsigned long bytes_left;
 203
 204         *out_pages = 0;
 205         *total_out = 0;
 206         *total_in = 0;
 207
 208         workspace = find_zlib_workspace();
 209         if (IS_ERR(workspace))
 210                 return -1;
 211
 212         if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
 213                 printk(KERN_WARNING "deflateInit failed\n");
 214                 ret = -1;
 215                 goto out;
 216         }
 217
 218         workspace->def_strm.total_in = 0;
 219         workspace->def_strm.total_out = 0;
 220
 221         in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
 222         data_in = kmap(in_page);
 223
 224         out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 225         cpage_out = kmap(out_page);
 226         pages[0] = out_page;
 227         nr_pages = 1;
 228
 229         workspace->def_strm.next_in = data_in;
 230         workspace->def_strm.next_out = cpage_out;
 231         workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
 232         workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
 233
 234         while (workspace->def_strm.total_in < len) {
 235                 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
 236                 if (ret != Z_OK) {
 237                         printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
 238                                ret);
 239                         zlib_deflateEnd(&workspace->def_strm);
 240                         ret = -1;
 241                         goto out;
 242                 }
 243
 244                 /* we're making it bigger, give up */
 245                 if (workspace->def_strm.total_in > 8192 &&
 246                     workspace->def_strm.total_in <
 247                     workspace->def_strm.total_out) {
 248                         ret = -1;
 249                         goto out;
 250                 }
 251                 /* we need another page for writing out.  Test this
 252                  * before the total_in so we will pull in a new page for
 253                  * the stream end if required
 254                  */
 255                 if (workspace->def_strm.avail_out == 0) {
 256                         kunmap(out_page);
 257                         if (nr_pages == nr_dest_pages) {
 258                                 out_page = NULL;
 259                                 ret = -1;
 260                                 goto out;
 261                         }
 262                         out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 263                         cpage_out = kmap(out_page);
 264                         pages[nr_pages] = out_page;
 265                         nr_pages++;
 266                         workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
 267                         workspace->def_strm.next_out = cpage_out;
 268                 }
 269                 /* we're all done */
 270                 if (workspace->def_strm.total_in >= len)
 271                         break;
 272
 273                 /* we've read in a full page, get a new one */
 274                 if (workspace->def_strm.avail_in == 0) {
 275                         if (workspace->def_strm.total_out > max_out)
 276                                 break;
 277
 278                         bytes_left = len - workspace->def_strm.total_in;
 279                         kunmap(in_page);
 280                         page_cache_release(in_page);
 281
 282                         start += PAGE_CACHE_SIZE;
 283                         in_page = find_get_page(mapping,
 284                                                 start >> PAGE_CACHE_SHIFT);
 285                         data_in = kmap(in_page);
 286                         workspace->def_strm.avail_in = min(bytes_left,
 287                                                            PAGE_CACHE_SIZE);
 288                         workspace->def_strm.next_in = data_in;
 289                 }
 290         }
 291         workspace->def_strm.avail_in = 0;
 292         ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
 293         zlib_deflateEnd(&workspace->def_strm);
 294
 295         if (ret != Z_STREAM_END) {
 296                 ret = -1;
 297                 goto out;
 298         }
 299
 300         if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
 301                 ret = -1;
 302                 goto out;
 303         }
 304
 305         ret = 0;
 306         *total_out = workspace->def_strm.total_out;
 307         *total_in = workspace->def_strm.total_in;
 308 out:
 309         *out_pages = nr_pages;
 310         if (out_page)
 311                 kunmap(out_page);
 312
 313         if (in_page) {
 314                 kunmap(in_page);
 315                 page_cache_release(in_page);
 316         }
 317         free_workspace(workspace);
 318         return ret;
 319 }
 320
 321 /*
 322  * pages_in is an array of pages with compressed data.
 323  *
 324  * disk_start is the starting logical offset of this array in the file
 325  *
 326  * bvec is a bio_vec of pages from the file that we want to decompress into
 327  *
 328  * vcnt is the count of pages in the biovec
 329  *
 330  * srclen is the number of bytes in pages_in
 331  *
 332  * The basic idea is that we have a bio that was created by readpages.
 333  * The pages in the bio are for the uncompressed data, and they may not
 334  * be contiguous.  They all correspond to the range of bytes covered by
 335  * the compressed extent.
 336  */
 337 int btrfs_zlib_decompress_biovec(struct page **pages_in,
 338                               u64 disk_start,
 339                               struct bio_vec *bvec,
 340                               int vcnt,
 341                               size_t srclen)
 342 {
 343         int ret = 0;
 344         int wbits = MAX_WBITS;
 345         struct workspace *workspace;
 346         char *data_in;
 347         size_t total_out = 0;
 348         unsigned long page_bytes_left;
 349         unsigned long page_in_index = 0;
 350         unsigned long page_out_index = 0;
 351         struct page *page_out;
 352         unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
 353                                         PAGE_CACHE_SIZE;
 354         unsigned long buf_start;
 355         unsigned long buf_offset;
 356         unsigned long bytes;
 357         unsigned long working_bytes;
 358         unsigned long pg_offset;
 359         unsigned long start_byte;
 360         unsigned long current_buf_start;
 361         char *kaddr;
 362
 363         workspace = find_zlib_workspace();
 364         if (IS_ERR(workspace))
 365                 return -ENOMEM;
 366
 367         data_in = kmap(pages_in[page_in_index]);
 368         workspace->inf_strm.next_in = data_in;
 369         workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
 370         workspace->inf_strm.total_in = 0;
 371
 372         workspace->inf_strm.total_out = 0;
 373         workspace->inf_strm.next_out = workspace->buf;
 374         workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
 375         page_out = bvec[page_out_index].bv_page;
 376         page_bytes_left = PAGE_CACHE_SIZE;
 377         pg_offset = 0;
 378
 379         /* If it's deflate, and it's got no preset dictionary, then
 380            we can tell zlib to skip the adler32 check. */
 381         if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
 382             ((data_in[0] & 0x0f) == Z_DEFLATED) &&
 383             !(((data_in[0]<<8) + data_in[1]) % 31)) {
 384
 385                 wbits = -((data_in[0] >> 4) + 8);
 386                 workspace->inf_strm.next_in += 2;
 387                 workspace->inf_strm.avail_in -= 2;
 388         }
 389
 390         if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
 391                 printk(KERN_WARNING "inflateInit failed\n");
 392                 ret = -1;
 393                 goto out;
 394         }
 395         while (workspace->inf_strm.total_in < srclen) {
 396                 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
 397                 if (ret != Z_OK && ret != Z_STREAM_END)
 398                         break;
 399                 /*
 400                  * buf start is the byte offset we're of the start of
 401                  * our workspace buffer
 402                  */
 403                 buf_start = total_out;
 404
 405                 /* total_out is the last byte of the workspace buffer */
 406                 total_out = workspace->inf_strm.total_out;
 407
 408                 working_bytes = total_out - buf_start;
 409
 410                 /*
 411                  * start byte is the first byte of the page we're currently
 412                  * copying into relative to the start of the compressed data.
 413                  */
 414                 start_byte = page_offset(page_out) - disk_start;
 415
 416                 if (working_bytes == 0) {
 417                         /* we didn't make progress in this inflate
 418                          * call, we're done
 419                          */
 420                         if (ret != Z_STREAM_END)
 421                                 ret = -1;
 422                         break;
 423                 }
 424
 425                 /* we haven't yet hit data corresponding to this page */
 426                 if (total_out <= start_byte)
 427                         goto next;
 428
 429                 /*
 430                  * the start of the data we care about is offset into
 431                  * the middle of our working buffer
 432                  */
 433                 if (total_out > start_byte && buf_start < start_byte) {
 434                         buf_offset = start_byte - buf_start;
 435                         working_bytes -= buf_offset;
 436                 } else {
 437                         buf_offset = 0;
 438                 }
 439                 current_buf_start = buf_start;
 440
 441                 /* copy bytes from the working buffer into the pages */
 442                 while (working_bytes > 0) {
 443                         bytes = min(PAGE_CACHE_SIZE - pg_offset,
 444                                     PAGE_CACHE_SIZE - buf_offset);
 445                         bytes = min(bytes, working_bytes);
 446                         kaddr = kmap_atomic(page_out, KM_USER0);
 447                         memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
 448                                bytes);
 449                         kunmap_atomic(kaddr, KM_USER0);
 450                         flush_dcache_page(page_out);
 451
 452                         pg_offset += bytes;
 453                         page_bytes_left -= bytes;
 454                         buf_offset += bytes;
 455                         working_bytes -= bytes;
 456                         current_buf_start += bytes;
 457
 458                         /* check if we need to pick another page */
 459                         if (page_bytes_left == 0) {
 460                                 page_out_index++;
 461                                 if (page_out_index >= vcnt) {
 462                                         ret = 0;
 463                                         goto done;
 464                                 }
 465
 466                                 page_out = bvec[page_out_index].bv_page;
 467                                 pg_offset = 0;
 468                                 page_bytes_left = PAGE_CACHE_SIZE;
 469                                 start_byte = page_offset(page_out) - disk_start;
 470
 471                                 /*
 472                                  * make sure our new page is covered by this
 473                                  * working buffer
 474                                  */
 475                                 if (total_out <= start_byte)
 476                                         goto next;
 477
 478                                 /* the next page in the biovec might not
 479                                  * be adjacent to the last page, but it
 480                                  * might still be found inside this working
 481                                  * buffer.  bump our offset pointer
 482                                  */
 483                                 if (total_out > start_byte &&
 484                                     current_buf_start < start_byte) {
 485                                         buf_offset = start_byte - buf_start;
 486                                         working_bytes = total_out - start_byte;
 487                                         current_buf_start = buf_start +
 488                                                 buf_offset;
 489                                 }
 490                         }
 491                 }
 492 next:
 493                 workspace->inf_strm.next_out = workspace->buf;
 494                 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
 495
 496                 if (workspace->inf_strm.avail_in == 0) {
 497                         unsigned long tmp;
 498                         kunmap(pages_in[page_in_index]);
 499                         page_in_index++;
 500                         if (page_in_index >= total_pages_in) {
 501                                 data_in = NULL;
 502                                 break;
 503                         }
 504                         data_in = kmap(pages_in[page_in_index]);
 505                         workspace->inf_strm.next_in = data_in;
 506                         tmp = srclen - workspace->inf_strm.total_in;
 507                         workspace->inf_strm.avail_in = min(tmp,
 508                                                            PAGE_CACHE_SIZE);
 509                 }
 510         }
 511         if (ret != Z_STREAM_END)
 512                 ret = -1;
 513         else
 514                 ret = 0;
 515 done:
 516         zlib_inflateEnd(&workspace->inf_strm);
 517         if (data_in)
 518                 kunmap(pages_in[page_in_index]);
 519 out:
 520         free_workspace(workspace);
 521         return ret;
 522 }
 523
 524 /*
 525  * a less complex decompression routine.  Our compressed data fits in a
 526  * single page, and we want to read a single page out of it.
 527  * start_byte tells us the offset into the compressed data we're interested in
 528  */
 529 int btrfs_zlib_decompress(unsigned char *data_in,
 530                           struct page *dest_page,
 531                           unsigned long start_byte,
 532                           size_t srclen, size_t destlen)
 533 {
 534         int ret = 0;
 535         int wbits = MAX_WBITS;
 536         struct workspace *workspace;
 537         unsigned long bytes_left = destlen;
 538         unsigned long total_out = 0;
 539         char *kaddr;
 540
 541         if (destlen > PAGE_CACHE_SIZE)
 542                 return -ENOMEM;
 543
 544         workspace = find_zlib_workspace();
 545         if (IS_ERR(workspace))
 546                 return -ENOMEM;
 547
 548         workspace->inf_strm.next_in = data_in;
 549         workspace->inf_strm.avail_in = srclen;
 550         workspace->inf_strm.total_in = 0;
 551
 552         workspace->inf_strm.next_out = workspace->buf;
 553         workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
 554         workspace->inf_strm.total_out = 0;
 555         /* If it's deflate, and it's got no preset dictionary, then
 556            we can tell zlib to skip the adler32 check. */
 557         if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
 558             ((data_in[0] & 0x0f) == Z_DEFLATED) &&
 559             !(((data_in[0]<<8) + data_in[1]) % 31)) {
 560
 561                 wbits = -((data_in[0] >> 4) + 8);
 562                 workspace->inf_strm.next_in += 2;
 563                 workspace->inf_strm.avail_in -= 2;
 564         }
 565
 566         if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
 567                 printk(KERN_WARNING "inflateInit failed\n");
 568                 ret = -1;
 569                 goto out;
 570         }
 571
 572         while (bytes_left > 0) {
 573                 unsigned long buf_start;
 574                 unsigned long buf_offset;
 575                 unsigned long bytes;
 576                 unsigned long pg_offset = 0;
 577
 578                 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
 579                 if (ret != Z_OK && ret != Z_STREAM_END)
 580                         break;
 581
 582                 buf_start = total_out;
 583                 total_out = workspace->inf_strm.total_out;
 584
 585                 if (total_out == buf_start) {
 586                         ret = -1;
 587                         break;
 588                 }
 589
 590                 if (total_out <= start_byte)
 591                         goto next;
 592
 593                 if (total_out > start_byte && buf_start < start_byte)
 594                         buf_offset = start_byte - buf_start;
 595                 else
 596                         buf_offset = 0;
 597
 598                 bytes = min(PAGE_CACHE_SIZE - pg_offset,
 599                             PAGE_CACHE_SIZE - buf_offset);
 600                 bytes = min(bytes, bytes_left);
 601
 602                 kaddr = kmap_atomic(dest_page, KM_USER0);
 603                 memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
 604                 kunmap_atomic(kaddr, KM_USER0);
 605
 606                 pg_offset += bytes;
 607                 bytes_left -= bytes;
 608 next:
 609                 workspace->inf_strm.next_out = workspace->buf;
 610                 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
 611         }
 612
 613         if (ret != Z_STREAM_END && bytes_left != 0)
 614                 ret = -1;
 615         else
 616                 ret = 0;
 617
 618         zlib_inflateEnd(&workspace->inf_strm);
 619 out:
 620         free_workspace(workspace);
 621         return ret;
 622 }
 623
 624 void btrfs_zlib_exit(void)
 625 {
 626     free_workspaces();
 627 }