lib/Support/Compressor.cpp

   1 //===- lib/Support/Compressor.cpp -------------------------------*- C++ -*-===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file was developed by Reid Spencer and is distributed under the
   6 // University of Illinois Open Source License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file implements the llvm::Compressor class, an abstraction for memory
  11 // block compression.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "llvm/Config/config.h"
  16 #include "llvm/Support/Compressor.h"
  17 #include "llvm/ADT/StringExtras.h"
  18 #include <cassert>
  19 #include <string>
  20 #include <ostream>
  21 #include "bzip2/bzlib.h"
  22 using namespace llvm;
  23
  24 enum CompressionTypes {
  25   COMP_TYPE_NONE  = '0',
  26   COMP_TYPE_BZIP2 = '2'
  27 };
  28
  29 static int getdata(char*& buffer, size_t &size,
  30                    llvm::Compressor::OutputDataCallback* cb, void* context) {
  31   buffer = 0;
  32   size = 0;
  33   int result = (*cb)(buffer, size, context);
  34   assert(buffer != 0 && "Invalid result from Compressor callback");
  35   assert(size != 0 && "Invalid result from Compressor callback");
  36   return result;
  37 }
  38
  39 static int getdata_uns(char*& buffer, unsigned &size,
  40                        llvm::Compressor::OutputDataCallback* cb, void* context)
  41 {
  42   size_t SizeOut;
  43   int Res = getdata(buffer, SizeOut, cb, context);
  44   size = SizeOut;
  45   return Res;
  46 }
  47
  48 //===----------------------------------------------------------------------===//
  49 //=== NULLCOMP - a compression like set of routines that just copies data
  50 //===            without doing any compression. This is provided so that if the
  51 //===            configured environment doesn't have a compression library the
  52 //===            program can still work, albeit using more data/memory.
  53 //===----------------------------------------------------------------------===//
  54
  55 struct NULLCOMP_stream {
  56   // User provided fields
  57   char*  next_in;
  58   size_t avail_in;
  59   char*  next_out;
  60   size_t avail_out;
  61
  62   // Information fields
  63   size_t output_count; // Total count of output bytes
  64 };
  65
  66 static void NULLCOMP_init(NULLCOMP_stream* s) {
  67   s->output_count = 0;
  68 }
  69
  70 static bool NULLCOMP_compress(NULLCOMP_stream* s) {
  71   assert(s && "Invalid NULLCOMP_stream");
  72   assert(s->next_in != 0);
  73   assert(s->next_out != 0);
  74   assert(s->avail_in >= 1);
  75   assert(s->avail_out >= 1);
  76
  77   if (s->avail_out >= s->avail_in) {
  78     ::memcpy(s->next_out, s->next_in, s->avail_in);
  79     s->output_count += s->avail_in;
  80     s->avail_out -= s->avail_in;
  81     s->next_in += s->avail_in;
  82     s->avail_in = 0;
  83     return true;
  84   } else {
  85     ::memcpy(s->next_out, s->next_in, s->avail_out);
  86     s->output_count += s->avail_out;
  87     s->avail_in -= s->avail_out;
  88     s->next_in += s->avail_out;
  89     s->avail_out = 0;
  90     return false;
  91   }
  92 }
  93
  94 static bool NULLCOMP_decompress(NULLCOMP_stream* s) {
  95   assert(s && "Invalid NULLCOMP_stream");
  96   assert(s->next_in != 0);
  97   assert(s->next_out != 0);
  98   assert(s->avail_in >= 1);
  99   assert(s->avail_out >= 1);
 100
 101   if (s->avail_out >= s->avail_in) {
 102     ::memcpy(s->next_out, s->next_in, s->avail_in);
 103     s->output_count += s->avail_in;
 104     s->avail_out -= s->avail_in;
 105     s->next_in += s->avail_in;
 106     s->avail_in = 0;
 107     return true;
 108   } else {
 109     ::memcpy(s->next_out, s->next_in, s->avail_out);
 110     s->output_count += s->avail_out;
 111     s->avail_in -= s->avail_out;
 112     s->next_in += s->avail_out;
 113     s->avail_out = 0;
 114     return false;
 115   }
 116 }
 117
 118 static void NULLCOMP_end(NULLCOMP_stream* strm) {
 119 }
 120
 121 namespace {
 122
 123 /// This structure is only used when a bytecode file is compressed.
 124 /// As bytecode is being decompressed, the memory buffer might need
 125 /// to be reallocated. The buffer allocation is handled in a callback
 126 /// and this structure is needed to retain information across calls
 127 /// to the callback.
 128 /// @brief An internal buffer object used for handling decompression
 129 struct BufferContext {
 130   char* buff;
 131   size_t size;
 132   BufferContext(size_t compressedSize) {
 133     // Null to indicate malloc of a new block
 134     buff = 0;
 135
 136     // Compute the initial length of the uncompression buffer. Note that this
 137     // is twice the length of the compressed buffer and will be doubled again
 138     // in the callback for an initial allocation of 4x compressedSize.  This
 139     // calculation is based on the typical compression ratio of bzip2 on LLVM
 140     // bytecode files which typically ranges in the 50%-75% range.   Since we
 141     // typically get at least 50%, doubling is insufficient. By using a 4x
 142     // multiplier on the first allocation, we minimize the impact of having to
 143     // copy the buffer on reallocation.
 144     size = compressedSize*2;
 145   }
 146
 147   /// trimTo - Reduce the size of the buffer down to the specified amount.  This
 148   /// is useful after have read in the bytecode file to discard extra unused
 149   /// memory.
 150   ///
 151   void trimTo(size_t NewSize) {
 152     buff = (char*)::realloc(buff, NewSize);
 153     size = NewSize;
 154   }
 155
 156   /// This function handles allocation of the buffer used for decompression of
 157   /// compressed bytecode files. It is called by Compressor::decompress which is
 158   /// called by BytecodeReader::ParseBytecode.
 159   static size_t callback(char*&buff, size_t &sz, void* ctxt){
 160     // Case the context variable to our BufferContext
 161     BufferContext* bc = reinterpret_cast<BufferContext*>(ctxt);
 162
 163     // Compute the new, doubled, size of the block
 164     size_t new_size = bc->size * 2;
 165
 166     // Extend or allocate the block (realloc(0,n) == malloc(n))
 167     char* new_buff = (char*) ::realloc(bc->buff, new_size);
 168
 169     // Figure out what to return to the Compressor. If this is the first call,
 170     // then bc->buff will be null. In this case we want to return the entire
 171     // buffer because there was no previous allocation.  Otherwise, when the
 172     // buffer is reallocated, we save the new base pointer in the
 173     // BufferContext.buff field but return the address of only the extension,
 174     // mid-way through the buffer (since its size was doubled). Furthermore,
 175     // the sz result must be 1/2 the total size of the buffer.
 176     if (bc->buff == 0 ) {
 177       buff = bc->buff = new_buff;
 178       sz = new_size;
 179     } else {
 180       bc->buff = new_buff;
 181       buff = new_buff + bc->size;
 182       sz = bc->size;
 183     }
 184
 185     // Retain the size of the allocated block
 186     bc->size = new_size;
 187
 188     // Make sure we fail (return 1) if we didn't get any memory.
 189     return (bc->buff == 0 ? 1 : 0);
 190   }
 191 };
 192
 193 } // end anonymous namespace
 194
 195
 196 namespace {
 197
 198 // This structure retains the context when compressing the bytecode file. The
 199 // WriteCompressedData function below uses it to keep track of the previously
 200 // filled chunk of memory (which it writes) and how many bytes have been
 201 // written.
 202 struct WriterContext {
 203   // Initialize the context
 204   WriterContext(std::ostream*OS, size_t CS)
 205     : chunk(0), sz(0), written(0), compSize(CS), Out(OS) {}
 206
 207   // Make sure we clean up memory
 208   ~WriterContext() {
 209     if (chunk)
 210       delete [] chunk;
 211   }
 212
 213   // Write the chunk
 214   void write(size_t size = 0) {
 215     size_t write_size = (size == 0 ? sz : size);
 216     Out->write(chunk,write_size);
 217     written += write_size;
 218     delete [] chunk;
 219     chunk = 0;
 220     sz = 0;
 221   }
 222
 223   // This function is a callback used by the Compressor::compress function to
 224   // allocate memory for the compression buffer. This function fulfills that
 225   // responsibility but also writes the previous (now filled) buffer out to the
 226   // stream.
 227   static size_t callback(char*& buffer, size_t &size, void* context) {
 228     // Cast the context to the structure it must point to.
 229     WriterContext* ctxt = reinterpret_cast<WriterContext*>(context);
 230
 231     // If there's a previously allocated chunk, it must now be filled with
 232     // compressed data, so we write it out and deallocate it.
 233     if (ctxt->chunk != 0 && ctxt->sz > 0 ) {
 234       ctxt->write();
 235     }
 236
 237     // Compute the size of the next chunk to allocate. We attempt to allocate
 238     // enough memory to handle the compression in a single memory allocation. In
 239     // general, the worst we do on compression of bytecode is about 50% so we
 240     // conservatively estimate compSize / 2 as the size needed for the
 241     // compression buffer. compSize is the size of the compressed data, provided
 242     // by WriteBytecodeToFile.
 243     size = ctxt->sz = ctxt->compSize / 2;
 244
 245     // Allocate the chunks
 246     buffer = ctxt->chunk = new char [size];
 247
 248     // We must return 1 if the allocation failed so that the Compressor knows
 249     // not to use the buffer pointer.
 250     return (ctxt->chunk == 0 ? 1 : 0);
 251   }
 252
 253   char* chunk;       // pointer to the chunk of memory filled by compression
 254   size_t sz;         // size of chunk
 255   size_t written;    // aggregate total of bytes written in all chunks
 256   size_t compSize;   // size of the uncompressed buffer
 257   std::ostream* Out; // The stream we write the data to.
 258 };
 259
 260 }  // end anonymous namespace
 261
 262 // Compress in one of three ways
 263 size_t Compressor::compress(const char* in, size_t size,
 264                             OutputDataCallback* cb, void* context,
 265                             std::string* error ) {
 266   assert(in && "Can't compress null buffer");
 267   assert(size && "Can't compress empty buffer");
 268   assert(cb && "Can't compress without a callback function");
 269
 270   size_t result = 0;
 271
 272   // For small files, we just don't bother compressing. bzip2 isn't very good
 273   // with tiny files and can actually make the file larger, so we just avoid
 274   // it altogether.
 275   if (size > 64*1024) {
 276     // Set up the bz_stream
 277     bz_stream bzdata;
 278     bzdata.bzalloc = 0;
 279     bzdata.bzfree = 0;
 280     bzdata.opaque = 0;
 281     bzdata.next_in = (char*)in;
 282     bzdata.avail_in = size;
 283     bzdata.next_out = 0;
 284     bzdata.avail_out = 0;
 285     switch ( BZ2_bzCompressInit(&bzdata, 5, 0, 100) ) {
 286       case BZ_CONFIG_ERROR:
 287         if (error)
 288           *error = "bzip2 library mis-compiled";
 289         return result;
 290       case BZ_PARAM_ERROR:
 291         if (error)
 292           *error = "Compressor internal error";
 293         return result;
 294       case BZ_MEM_ERROR:
 295         if (error)
 296           *error = "Out of memory";
 297         return result;
 298       case BZ_OK:
 299       default:
 300         break;
 301     }
 302
 303     // Get a block of memory
 304     if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
 305       BZ2_bzCompressEnd(&bzdata);
 306       if (error)
 307         *error = "Can't allocate output buffer";
 308       return result;
 309     }
 310
 311     // Put compression code in first byte
 312     (*bzdata.next_out++) = COMP_TYPE_BZIP2;
 313     bzdata.avail_out--;
 314
 315     // Compress it
 316     int bzerr = BZ_FINISH_OK;
 317     while (BZ_FINISH_OK == (bzerr = BZ2_bzCompress(&bzdata, BZ_FINISH))) {
 318       if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
 319         BZ2_bzCompressEnd(&bzdata);
 320         if (error)
 321           *error = "Can't allocate output buffer";
 322         return result;
 323       }
 324     }
 325     switch (bzerr) {
 326       case BZ_SEQUENCE_ERROR:
 327       case BZ_PARAM_ERROR:
 328         if (error)
 329           *error = "Param/Sequence error";
 330         return result;
 331       case BZ_FINISH_OK:
 332       case BZ_STREAM_END: break;
 333       default:
 334         if (error)
 335           *error = "BZip2 Error: " + utostr(unsigned(bzerr));
 336         return result;
 337     }
 338
 339     // Finish
 340     result = bzdata.total_out_lo32 + 1;
 341     if (sizeof(size_t) == sizeof(uint64_t))
 342       result |= static_cast<uint64_t>(bzdata.total_out_hi32) << 32;
 343
 344     BZ2_bzCompressEnd(&bzdata);
 345   } else {
 346     // Do null compression, for small files
 347     NULLCOMP_stream sdata;
 348     sdata.next_in = (char*)in;
 349     sdata.avail_in = size;
 350     NULLCOMP_init(&sdata);
 351
 352     if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
 353       if (error)
 354         *error = "Can't allocate output buffer";
 355       return result;
 356     }
 357
 358     *(sdata.next_out++) = COMP_TYPE_NONE;
 359     sdata.avail_out--;
 360
 361     while (!NULLCOMP_compress(&sdata)) {
 362       if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
 363         if (error)
 364           *error = "Can't allocate output buffer";
 365         return result;
 366       }
 367     }
 368
 369     result = sdata.output_count + 1;
 370     NULLCOMP_end(&sdata);
 371   }
 372   return result;
 373 }
 374
 375 size_t Compressor::compressToNewBuffer(const char* in, size_t size, char*&out,
 376                                        std::string* error) {
 377   BufferContext bc(size);
 378   size_t result = compress(in,size,BufferContext::callback,(void*)&bc,error);
 379   bc.trimTo(result);
 380   out = bc.buff;
 381   return result;
 382 }
 383
 384 size_t
 385 Compressor::compressToStream(const char*in, size_t size, std::ostream& out,
 386                              std::string* error) {
 387   // Set up the context and writer
 388   WriterContext ctxt(&out, size / 2);
 389
 390   // Compress everything after the magic number (which we'll alter).
 391   size_t zipSize = Compressor::compress(in,size,
 392     WriterContext::callback, (void*)&ctxt,error);
 393
 394   if (zipSize && ctxt.chunk) {
 395     ctxt.write(zipSize - ctxt.written);
 396   }
 397   return zipSize;
 398 }
 399
 400 // Decompress in one of three ways
 401 size_t Compressor::decompress(const char *in, size_t size,
 402                               OutputDataCallback* cb, void* context,
 403                               std::string* error) {
 404   assert(in && "Can't decompress null buffer");
 405   assert(size > 1 && "Can't decompress empty buffer");
 406   assert(cb && "Can't decompress without a callback function");
 407
 408   size_t result = 0;
 409
 410   switch (*in++) {
 411     case COMP_TYPE_BZIP2: {
 412       // Set up the bz_stream
 413       bz_stream bzdata;
 414       bzdata.bzalloc = 0;
 415       bzdata.bzfree = 0;
 416       bzdata.opaque = 0;
 417       bzdata.next_in = (char*)in;
 418       bzdata.avail_in = size - 1;
 419       bzdata.next_out = 0;
 420       bzdata.avail_out = 0;
 421       switch ( BZ2_bzDecompressInit(&bzdata, 0, 0) ) {
 422         case BZ_CONFIG_ERROR:
 423           if (error)
 424             *error = "bzip2 library mis-compiled";
 425           return result;
 426         case BZ_PARAM_ERROR:
 427           if (error)
 428             *error = "Compressor internal error";
 429           return result;
 430         case BZ_MEM_ERROR:
 431           if (error)
 432             *error = "Out of memory";
 433           return result;
 434         case BZ_OK:
 435         default:
 436           break;
 437       }
 438
 439       // Get a block of memory
 440       if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
 441         BZ2_bzDecompressEnd(&bzdata);
 442         if (error)
 443           *error = "Can't allocate output buffer";
 444         return result;
 445       }
 446
 447       // Decompress it
 448       int bzerr = BZ_OK;
 449       while ( BZ_OK == (bzerr = BZ2_bzDecompress(&bzdata)) &&
 450               bzdata.avail_in != 0 ) {
 451         if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
 452           BZ2_bzDecompressEnd(&bzdata);
 453           if (error)
 454             *error = "Can't allocate output buffer";
 455           return result;
 456         }
 457       }
 458
 459       switch (bzerr) {
 460           BZ2_bzDecompressEnd(&bzdata);
 461         case BZ_PARAM_ERROR:
 462           if (error)
 463             *error = "Compressor internal error";
 464           return result;
 465         case BZ_MEM_ERROR:
 466           BZ2_bzDecompressEnd(&bzdata);
 467           if (error)
 468             *error = "Out of memory";
 469           return result;
 470         case BZ_DATA_ERROR:
 471           BZ2_bzDecompressEnd(&bzdata);
 472           if (error)
 473             *error = "Data integrity error";
 474           return result;
 475         case BZ_DATA_ERROR_MAGIC:
 476           BZ2_bzDecompressEnd(&bzdata);
 477           if (error)
 478             *error = "Data is not BZIP2";
 479           return result;
 480         case BZ_OK:
 481           BZ2_bzDecompressEnd(&bzdata);
 482           if (error)
 483             *error = "Insufficient input for bzip2";
 484           return result;
 485         case BZ_STREAM_END: break;
 486         default:
 487           BZ2_bzDecompressEnd(&bzdata);
 488           if (error)
 489             *error = "Unknown result code from bzDecompress";
 490           return result;
 491       }
 492
 493       // Finish
 494       result = bzdata.total_out_lo32;
 495       if (sizeof(size_t) == sizeof(uint64_t))
 496         result |= (static_cast<uint64_t>(bzdata.total_out_hi32) << 32);
 497       BZ2_bzDecompressEnd(&bzdata);
 498       break;
 499     }
 500
 501     case COMP_TYPE_NONE: {
 502       NULLCOMP_stream sdata;
 503       sdata.next_in = (char*)in;
 504       sdata.avail_in = size - 1;
 505       NULLCOMP_init(&sdata);
 506
 507       if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
 508         if (error)
 509           *error = "Can't allocate output buffer";
 510         return result;
 511       }
 512
 513       while (!NULLCOMP_decompress(&sdata)) {
 514         if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
 515           if (error)
 516             *error = "Can't allocate output buffer";
 517           return result;
 518         }
 519       }
 520
 521       result = sdata.output_count;
 522       NULLCOMP_end(&sdata);
 523       break;
 524     }
 525
 526     default:
 527       if (error)
 528         *error = "Unknown type of compressed data";
 529       return result;
 530   }
 531
 532   return result;
 533 }
 534
 535 size_t
 536 Compressor::decompressToNewBuffer(const char* in, size_t size, char*&out,
 537                                   std::string* error) {
 538   BufferContext bc(size);
 539   size_t result = decompress(in,size,BufferContext::callback,(void*)&bc,error);
 540   out = bc.buff;
 541   return result;
 542 }
 543
 544 size_t
 545 Compressor::decompressToStream(const char*in, size_t size, std::ostream& out,
 546                                std::string* error) {
 547   // Set up the context and writer
 548   WriterContext ctxt(&out,size / 2);
 549
 550   // Decompress everything after the magic number (which we'll alter)
 551   size_t zipSize = Compressor::decompress(in,size,
 552     WriterContext::callback, (void*)&ctxt,error);
 553
 554   if (zipSize && ctxt.chunk) {
 555     ctxt.write(zipSize - ctxt.written);
 556   }
 557   return zipSize;
 558 }