components/tiny_jpeg/tiny_jpeg.c

   1 #include "tiny_jpeg.h"
   2 #include <inttypes.h>
   3 #include <math.h>   // floorf, ceilf
   4 #include <string.h> // memcpy
   5
   6 #include "bsp_common.h"
   7 #ifdef assert
   8 #undef assert
   9 #endif
  10 #define assert(x) ASSERT(x)
  11 #define tjei_min(a, b) ((a) < b) ? (a) : (b);
  12 #define tjei_max(a, b) ((a) < b) ? (b) : (a);
  13
  14
  15 #if defined(_MSC_VER)
  16 #define TJEI_FORCE_INLINE __forceinline
  17 // #define TJEI_FORCE_INLINE __declspec(noinline)  // For profiling
  18 #else
  19 #define TJEI_FORCE_INLINE static // TODO: equivalent for gcc & clang
  20 #endif
  21
  22 // Only use zero for debugging and/or inspection.
  23 #define TJE_USE_FAST_DCT 1
  24
  25 // C std lib
  26
  27
  28 // ============================================================
  29 // Table definitions.
  30 //
// The spec defines reasonably good default quantization matrices and huffman
// specification tables.
  33 //
  34 //
  35 // Instead of hard-coding the final huffman table, we only hard-code the table
  36 // spec suggested by the specification, and then derive the full table from
  37 // there.  This is only for didactic purposes but it might be useful if there
  38 // ever is the case that we need to swap huffman tables from various sources.
  39 // ============================================================
  40
  41
  42 // K.1 - suggested luminance QT
  43 static const uint8_t tjei_default_qt_luma_from_spec[] =
  44 {
  45    16,11,10,16, 24, 40, 51, 61,
  46    12,12,14,19, 26, 58, 60, 55,
  47    14,13,16,24, 40, 57, 69, 56,
  48    14,17,22,29, 51, 87, 80, 62,
  49    18,22,37,56, 68,109,103, 77,
  50    24,35,55,64, 81,104,113, 92,
  51    49,64,78,87,103,121,120,101,
  52    72,92,95,98,112,100,103, 99,
  53 };
  54
  55 // Unused
  56 #if 0
  57 static const uint8_t tjei_default_qt_chroma_from_spec[] =
  58 {
  59     // K.1 - suggested chrominance QT
  60    17,18,24,47,99,99,99,99,
  61    18,21,26,66,99,99,99,99,
  62    24,26,56,99,99,99,99,99,
  63    47,66,99,99,99,99,99,99,
  64    99,99,99,99,99,99,99,99,
  65    99,99,99,99,99,99,99,99,
  66    99,99,99,99,99,99,99,99,
  67    99,99,99,99,99,99,99,99,
  68 };
  69 #endif
  70
  71 static const uint8_t tjei_default_qt_chroma_from_paper[] =
  72 {
  73     // Example QT from JPEG paper
  74     16,  12, 14,  14, 18, 24,  49,  72,
  75     11,  10, 16,  24, 40, 51,  61,  12,
  76     13,  17, 22,  35, 64, 92,  14,  16,
  77     22,  37, 55,  78, 95, 19,  24,  29,
  78     56,  64, 87,  98, 26, 40,  51,  68,
  79     81, 103, 112, 58, 57, 87,  109, 104,
  80     121,100, 60,  69, 80, 103, 113, 120,
  81     103, 55, 56,  62, 77, 92,  101, 99,
  82 };
  83
  84 // == Procedure to 'deflate' the huffman tree: JPEG spec, C.2
  85
  86 // Number of 16 bit values for every code length. (K.3.3.1)
  87 static const uint8_t tjei_default_ht_luma_dc_len[16] =
  88 {
  89     0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0
  90 };
  91 // values
  92 static const uint8_t tjei_default_ht_luma_dc[12] =
  93 {
  94     0,1,2,3,4,5,6,7,8,9,10,11
  95 };
  96
  97 // Number of 16 bit values for every code length. (K.3.3.1)
  98 static const uint8_t tjei_default_ht_chroma_dc_len[16] =
  99 {
 100     0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0
 101 };
 102 // values
 103 static const uint8_t tjei_default_ht_chroma_dc[12] =
 104 {
 105     0,1,2,3,4,5,6,7,8,9,10,11
 106 };
 107
 108 // Same as above, but AC coefficients.
 109 static const uint8_t tjei_default_ht_luma_ac_len[16] =
 110 {
 111     0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d
 112 };
 113 static const uint8_t tjei_default_ht_luma_ac[] =
 114 {
 115     0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
 116     0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xA1, 0x08, 0x23, 0x42, 0xB1, 0xC1, 0x15, 0x52, 0xD1, 0xF0,
 117     0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0A, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x25, 0x26, 0x27, 0x28,
 118     0x29, 0x2A, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
 119     0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
 120     0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
 121     0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
 122     0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3, 0xC4, 0xC5,
 123     0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xE1, 0xE2,
 124     0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
 125     0xF9, 0xFA
 126 };
 127
 128 static const uint8_t tjei_default_ht_chroma_ac_len[16] =
 129 {
 130     0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77
 131 };
 132 static const uint8_t tjei_default_ht_chroma_ac[] =
 133 {
 134     0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
 135     0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, 0xA1, 0xB1, 0xC1, 0x09, 0x23, 0x33, 0x52, 0xF0,
 136     0x15, 0x62, 0x72, 0xD1, 0x0A, 0x16, 0x24, 0x34, 0xE1, 0x25, 0xF1, 0x17, 0x18, 0x19, 0x1A, 0x26,
 137     0x27, 0x28, 0x29, 0x2A, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
 138     0x49, 0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
 139     0x69, 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
 140     0x88, 0x89, 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5,
 141     0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3,
 142     0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA,
 143     0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
 144     0xF9, 0xFA
 145 };
 146 static float aan_scales[] = {
 147     1.0f, 1.387039845f, 1.306562965f, 1.175875602f,
 148     1.0f, 0.785694958f, 0.541196100f, 0.275899379f
 149 };
 150
 151 // ============================================================
 152 // Code
 153 // ============================================================
 154
 155 // Zig-zag order:
 156 static const uint8_t tjei_zig_zag[64] =
 157 {
 158     0,   1,  5,  6, 14, 15, 27, 28,
 159     2,   4,  7, 13, 16, 26, 29, 42,
 160     3,   8, 12, 17, 25, 30, 41, 43,
 161     9,  11, 18, 24, 31, 40, 44, 53,
 162     10, 19, 23, 32, 39, 45, 52, 54,
 163     20, 22, 33, 38, 46, 51, 55, 60,
 164     21, 34, 37, 47, 50, 56, 59, 61,
 165     35, 36, 48, 49, 57, 58, 62, 63,
 166 };
 167 #define tjei_be_word BSP_Swap16
 168
 169 // ============================================================
 170 // The following structs exist only for code clarity, debugability, and
 171 // readability. They are used when writing to disk, but it is useful to have
 172 // 1-packed-structs to document how the format works, and to inspect memory
 173 // while developing.
 174 // ============================================================
 175
 176 static const uint8_t tjeik_jfif_id[] = "JFIF";
 177 static const uint8_t tjeik_com_str[] = "Created by JPEG Encoder";
 178
 179 // TODO: Get rid of packed structs!
 180 #pragma pack(push)
 181 #pragma pack(1)
 182 typedef struct
 183 {
 184     uint16_t SOI;
 185     // JFIF header.
 186     uint16_t APP0;
 187     uint16_t jfif_len;
 188     uint8_t  jfif_id[5];
 189     uint16_t version;
 190     uint8_t  units;
 191     uint16_t x_density;
 192     uint16_t y_density;
 193     uint8_t  x_thumb;
 194     uint8_t  y_thumb;
 195 } TJEJPEGHeader;
 196
 197 typedef struct
 198 {
 199     uint16_t com;
 200     uint16_t com_len;
 201     char     com_str[sizeof(tjeik_com_str) - 1];
 202 } TJEJPEGComment;
 203
 204 typedef struct
 205 {
 206     void*           context;
 207     tje_write_func* func;
 208 } TJEWriteContext;
 209
 210 typedef struct
 211 {
 212     // Huffman data.
 213     uint8_t         ehuffsize[4][257];
 214     uint16_t        ehuffcode[4][256];
 215     uint8_t const * ht_bits[4];
 216     uint8_t const * ht_vals[4];
 217
 218     // Cuantization tables.
 219     uint8_t         qt_luma[64];
 220     uint8_t         qt_chroma[64];
 221
 222     // fwrite by default. User-defined when using tje_encode_with_func.
 223     TJEWriteContext write_context;
 224
 225     // Buffered output. Big performance win when using the usual stdlib implementations.
 226     size_t          output_buffer_count;
 227     uint8_t         output_buffer[TJEI_BUFFER_SIZE];
 228 } TJEState;
 229
 230 // Helper struct for TJEFrameHeader (below).
 231 typedef struct
 232 {
 233     uint8_t  component_id;
 234     uint8_t  sampling_factors;    // most significant 4 bits: horizontal. 4 LSB: vertical (A.1.1)
 235     uint8_t  qt;                  // Quantization table selector.
 236 } TJEComponentSpec;
 237
 238 typedef struct
 239 {
 240     uint16_t         SOF;
 241     uint16_t         len;                   // 8 + 3 * frame.num_components
 242     uint8_t          precision;             // Sample precision (bits per sample).
 243     uint16_t         height;
 244     uint16_t         width;
 245     uint8_t          num_components;        // For this implementation, will be equal to 3.
 246     TJEComponentSpec component_spec[3];
 247 } TJEFrameHeader;
 248
 249 typedef struct
 250 {
 251     uint8_t component_id;                 // Just as with TJEComponentSpec
 252     uint8_t dc_ac;                        // (dc|ac)
 253 } TJEFrameComponentSpec;
 254
 255 typedef struct
 256 {
 257     uint16_t              SOS;
 258     uint16_t              len;
 259     uint8_t               num_components;  // 3.
 260     TJEFrameComponentSpec component_spec[3];
 261     uint8_t               first;  // 0
 262     uint8_t               last;  // 63
 263     uint8_t               ah_al;  // o
 264 } TJEScanHeader;
 265 #pragma pack(pop)
 266
 267
 268
 269
 270 static void tjei_write(TJEState* state, const void* data, size_t num_bytes, size_t num_elements)
 271 {
 272     size_t to_write = num_bytes * num_elements;
 273
 274     // Cap to the buffer available size and copy memory.
 275     size_t capped_count = tjei_min(to_write, TJEI_BUFFER_SIZE - 1 - state->output_buffer_count);
 276
 277     memcpy(state->output_buffer + state->output_buffer_count, data, capped_count);
 278     state->output_buffer_count += capped_count;
 279
 280     assert (state->output_buffer_count <= TJEI_BUFFER_SIZE - 1);
 281     // Flush the buffer.
 282     if ( state->output_buffer_count == TJEI_BUFFER_SIZE - 1 ) {
 283         state->write_context.func(state->write_context.context, state->output_buffer, (int)state->output_buffer_count);
 284         state->output_buffer_count = 0;
 285     }
 286
 287     // Recursively calling ourselves with the rest of the buffer.
 288     if (capped_count < to_write) {
 289         tjei_write(state, (uint8_t*)data+capped_count, to_write - capped_count, 1);
 290     }
 291 }
 292
 293 static void tjei_write_DQT(TJEState* state, const uint8_t* matrix, uint8_t id)
 294 {
 295     uint16_t DQT = tjei_be_word(0xffdb);
 296     tjei_write(state, &DQT, sizeof(uint16_t), 1);
 297     uint16_t len = tjei_be_word(0x0043); // 2(len) + 1(id) + 64(matrix) = 67 = 0x43
 298     tjei_write(state, &len, sizeof(uint16_t), 1);
 299     assert(id < 4);
 300     uint8_t precision_and_id = id;  // 0x0000 8 bits | 0x00id
 301     tjei_write(state, &precision_and_id, sizeof(uint8_t), 1);
 302     // Write matrix
 303     tjei_write(state, matrix, 64*sizeof(uint8_t), 1);
 304 }
 305
 306 typedef enum
 307 {
 308     TJEI_DC = 0,
 309     TJEI_AC = 1
 310 } TJEHuffmanTableClass;
 311
 312 static void tjei_write_DHT(TJEState* state,
 313                            uint8_t const * matrix_len,
 314                            uint8_t const * matrix_val,
 315                            TJEHuffmanTableClass ht_class,
 316                            uint8_t id)
 317 {
 318     int num_values = 0;
 319     for ( int i = 0; i < 16; ++i ) {
 320         num_values += matrix_len[i];
 321     }
 322     assert(num_values <= 0xffff);
 323
 324     uint16_t DHT = tjei_be_word(0xffc4);
 325     // 2(len) + 1(Tc|th) + 16 (num lengths) + ?? (num values)
 326     uint16_t len = tjei_be_word(2 + 1 + 16 + (uint16_t)num_values);
 327     assert(id < 4);
 328     uint8_t tc_th = (uint8_t)((((uint8_t)ht_class) << 4) | id);
 329
 330     tjei_write(state, &DHT, sizeof(uint16_t), 1);
 331     tjei_write(state, &len, sizeof(uint16_t), 1);
 332     tjei_write(state, &tc_th, sizeof(uint8_t), 1);
 333     tjei_write(state, matrix_len, sizeof(uint8_t), 16);
 334     tjei_write(state, matrix_val, sizeof(uint8_t), (size_t)num_values);
 335 }
 336 // ============================================================
 337 //  Huffman deflation code.
 338 // ============================================================
 339
 340 // Returns all code sizes from the BITS specification (JPEG C.3)
 341 static uint8_t* tjei_huff_get_code_lengths(uint8_t huffsize[/*256*/], uint8_t const * bits)
 342 {
 343     int k = 0;
 344     for ( int i = 0; i < 16; ++i ) {
 345         for ( int j = 0; j < bits[i]; ++j ) {
 346             huffsize[k++] = (uint8_t)(i + 1);
 347         }
 348         huffsize[k] = 0;
 349     }
 350     return huffsize;
 351 }
 352
 353 // Fills out the prefixes for each code.
 354 static uint16_t* tjei_huff_get_codes(uint16_t codes[], uint8_t* huffsize, int64_t count)
 355 {
 356     uint16_t code = 0;
 357     int k = 0;
 358     uint8_t sz = huffsize[0];
 359     for(;;) {
 360         do {
 361             assert(k < count);
 362             codes[k++] = code++;
 363         } while (huffsize[k] == sz);
 364         if (huffsize[k] == 0) {
 365             return codes;
 366         }
 367         do {
 368             code = (uint16_t)(code << 1);
 369             ++sz;
 370         } while( huffsize[k] != sz );
 371     }
 372 }
 373
 374 static void tjei_huff_get_extended(uint8_t* out_ehuffsize,
 375                                    uint16_t* out_ehuffcode,
 376                                    uint8_t const * huffval,
 377                                    uint8_t* huffsize,
 378                                    uint16_t* huffcode, int64_t count)
 379 {
 380     int k = 0;
 381     do {
 382         uint8_t val = huffval[k];
 383         out_ehuffcode[val] = huffcode[k];
 384         out_ehuffsize[val] = huffsize[k];
 385         k++;
 386     } while ( k < count );
 387 }
 388 // ============================================================
 389
 390 // Returns:
 391 //  out[1] : number of bits
 392 //  out[0] : bits
 393 TJEI_FORCE_INLINE void tjei_calculate_variable_length_int(int value, uint16_t out[2])
 394 {
 395     int abs_val = value;
 396     if ( value < 0 ) {
 397         abs_val = -abs_val;
 398         --value;
 399     }
 400     out[1] = 1;
 401     while( abs_val >>= 1 ) {
 402         ++out[1];
 403     }
 404     out[0] = (uint16_t)(value & ((1 << out[1]) - 1));
 405 }
 406
 407 // Write bits to file.
 408 TJEI_FORCE_INLINE void tjei_write_bits(TJEState* state,
 409                                        uint32_t* bitbuffer, uint32_t* location,
 410                                        uint16_t num_bits, uint16_t bits)
 411 {
 412     //   v-- location
 413     //  [                     ]   <-- bit buffer
 414     // 32                     0
 415     //
 416     // This call pushes to the bitbuffer and saves the location. Data is pushed
 417     // from most significant to less significant.
 418     // When we can write a full byte, we write a byte and shift.
 419
 420     // Push the stack.
 421     uint32_t nloc = *location + num_bits;
 422     *bitbuffer |= (uint32_t)(bits << (32 - nloc));
 423     *location = nloc;
 424     while ( *location >= 8 ) {
 425         // Grab the most significant byte.
 426         uint8_t c = (uint8_t)((*bitbuffer) >> 24);
 427         // Write it to file.
 428         tjei_write(state, &c, 1, 1);
 429         if ( c == 0xff )  {
 430             // Special case: tell JPEG this is not a marker.
 431             char z = 0;
 432             tjei_write(state, &z, 1, 1);
 433         }
 434         // Pop the stack.
 435         *bitbuffer <<= 8;
 436         *location -= 8;
 437     }
 438 }
 439
 440 // DCT implementation by Thomas G. Lane.
 441 // Obtained through NVIDIA
 442 //  http://developer.download.nvidia.com/SDK/9.5/Samples/vidimaging_samples.html#gpgpu_dct
 443 //
 444 // QUOTE:
 445 //  This implementation is based on Arai, Agui, and Nakajima's algorithm for
 446 //  scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
 447 //  Japanese, but the algorithm is described in the Pennebaker & Mitchell
 448 //  JPEG textbook (see REFERENCES section in file README).  The following code
 449 //  is based directly on figure 4-8 in P&M.
 450 //
 451 static void tjei_fdct (float * data)
 452 {
 453     float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 454     float tmp10, tmp11, tmp12, tmp13;
 455     float z1, z2, z3, z4, z5, z11, z13;
 456     float *dataptr;
 457     int ctr;
 458
 459     /* Pass 1: process rows. */
 460
 461     dataptr = data;
 462     for ( ctr = 7; ctr >= 0; ctr-- ) {
 463         tmp0 = dataptr[0] + dataptr[7];
 464         tmp7 = dataptr[0] - dataptr[7];
 465         tmp1 = dataptr[1] + dataptr[6];
 466         tmp6 = dataptr[1] - dataptr[6];
 467         tmp2 = dataptr[2] + dataptr[5];
 468         tmp5 = dataptr[2] - dataptr[5];
 469         tmp3 = dataptr[3] + dataptr[4];
 470         tmp4 = dataptr[3] - dataptr[4];
 471
 472         /* Even part */
 473
 474         tmp10 = tmp0 + tmp3;    /* phase 2 */
 475         tmp13 = tmp0 - tmp3;
 476         tmp11 = tmp1 + tmp2;
 477         tmp12 = tmp1 - tmp2;
 478
 479         dataptr[0] = tmp10 + tmp11; /* phase 3 */
 480         dataptr[4] = tmp10 - tmp11;
 481
 482         z1 = (tmp12 + tmp13) * ((float) 0.707106781); /* c4 */
 483         dataptr[2] = tmp13 + z1;    /* phase 5 */
 484         dataptr[6] = tmp13 - z1;
 485
 486         /* Odd part */
 487
 488         tmp10 = tmp4 + tmp5;    /* phase 2 */
 489         tmp11 = tmp5 + tmp6;
 490         tmp12 = tmp6 + tmp7;
 491
 492         /* The rotator is modified from fig 4-8 to avoid extra negations. */
 493         z5 = (tmp10 - tmp12) * ((float) 0.382683433); /* c6 */
 494         z2 = ((float) 0.541196100) * tmp10 + z5; /* c2-c6 */
 495         z4 = ((float) 1.306562965) * tmp12 + z5; /* c2+c6 */
 496         z3 = tmp11 * ((float) 0.707106781); /* c4 */
 497
 498         z11 = tmp7 + z3;        /* phase 5 */
 499         z13 = tmp7 - z3;
 500
 501         dataptr[5] = z13 + z2;  /* phase 6 */
 502         dataptr[3] = z13 - z2;
 503         dataptr[1] = z11 + z4;
 504         dataptr[7] = z11 - z4;
 505
 506         dataptr += 8;     /* advance pointer to next row */
 507     }
 508
 509     /* Pass 2: process columns. */
 510
 511     dataptr = data;
 512     for ( ctr = 8-1; ctr >= 0; ctr-- ) {
 513         tmp0 = dataptr[8*0] + dataptr[8*7];
 514         tmp7 = dataptr[8*0] - dataptr[8*7];
 515         tmp1 = dataptr[8*1] + dataptr[8*6];
 516         tmp6 = dataptr[8*1] - dataptr[8*6];
 517         tmp2 = dataptr[8*2] + dataptr[8*5];
 518         tmp5 = dataptr[8*2] - dataptr[8*5];
 519         tmp3 = dataptr[8*3] + dataptr[8*4];
 520         tmp4 = dataptr[8*3] - dataptr[8*4];
 521
 522         /* Even part */
 523
 524         tmp10 = tmp0 + tmp3;    /* phase 2 */
 525         tmp13 = tmp0 - tmp3;
 526         tmp11 = tmp1 + tmp2;
 527         tmp12 = tmp1 - tmp2;
 528
 529         dataptr[8*0] = tmp10 + tmp11; /* phase 3 */
 530         dataptr[8*4] = tmp10 - tmp11;
 531
 532         z1 = (tmp12 + tmp13) * ((float) 0.707106781); /* c4 */
 533         dataptr[8*2] = tmp13 + z1; /* phase 5 */
 534         dataptr[8*6] = tmp13 - z1;
 535
 536         /* Odd part */
 537
 538         tmp10 = tmp4 + tmp5;    /* phase 2 */
 539         tmp11 = tmp5 + tmp6;
 540         tmp12 = tmp6 + tmp7;
 541
 542         /* The rotator is modified from fig 4-8 to avoid extra negations. */
 543         z5 = (tmp10 - tmp12) * ((float) 0.382683433); /* c6 */
 544         z2 = ((float) 0.541196100) * tmp10 + z5; /* c2-c6 */
 545         z4 = ((float) 1.306562965) * tmp12 + z5; /* c2+c6 */
 546         z3 = tmp11 * ((float) 0.707106781); /* c4 */
 547
 548         z11 = tmp7 + z3;        /* phase 5 */
 549         z13 = tmp7 - z3;
 550
 551         dataptr[8*5] = z13 + z2; /* phase 6 */
 552         dataptr[8*3] = z13 - z2;
 553         dataptr[8*1] = z11 + z4;
 554         dataptr[8*7] = z11 - z4;
 555
 556         dataptr++;          /* advance pointer to next column */
 557     }
 558 }
 559 #if !TJE_USE_FAST_DCT
 560 static float slow_fdct(int u, int v, float* data)
 561 {
 562 #define kPI 3.14159265f
 563     float res = 0.0f;
 564     float cu = (u == 0) ? 0.70710678118654f : 1;
 565     float cv = (v == 0) ? 0.70710678118654f : 1;
 566     for ( int y = 0; y < 8; ++y ) {
 567         for ( int x = 0; x < 8; ++x ) {
 568             res += (data[y * 8 + x]) *
 569                     cosf(((2.0f * x + 1.0f) * u * kPI) / 16.0f) *
 570                     cosf(((2.0f * y + 1.0f) * v * kPI) / 16.0f);
 571         }
 572     }
 573     res *= 0.25f * cu * cv;
 574     return res;
 575 #undef kPI
 576 }
 577 #endif
 578
 579 #define ABS(x) ((x) < 0 ? -(x) : (x))
 580
 581 static void tjei_encode_and_write_MCU(TJEState* state,
 582                                       float* mcu,
 583 #if TJE_USE_FAST_DCT
 584                                       float* qt,  // Pre-processed quantization matrix.
 585 #else
 586                                       uint8_t* qt,
 587 #endif
 588                                       uint8_t* huff_dc_len, uint16_t* huff_dc_code, // Huffman tables
 589                                       uint8_t* huff_ac_len, uint16_t* huff_ac_code,
 590                                       int* pred,  // Previous DC coefficient
 591                                       uint32_t* bitbuffer,  // Bitstack.
 592                                       uint32_t* location)
 593 {
 594     int du[64];  // Data unit in zig-zag order
 595
 596     float dct_mcu[64];
 597     memcpy(dct_mcu, mcu, 64 * sizeof(float));
 598
 599 #if TJE_USE_FAST_DCT
 600     tjei_fdct(dct_mcu);
 601     for ( int i = 0; i < 64; ++i ) {
 602         float fval = dct_mcu[i];
 603         fval *= qt[i];
 604 #if 0
 605         fval = (fval > 0) ? floorf(fval + 0.5f) : ceilf(fval - 0.5f);
 606 #else
 607         fval = floorf(fval + 1024 + 0.5f);
 608         fval -= 1024;
 609 #endif
 610         int val = (int)fval;
 611         du[tjei_zig_zag[i]] = val;
 612     }
 613 #else
 614     for ( int v = 0; v < 8; ++v ) {
 615         for ( int u = 0; u < 8; ++u ) {
 616             dct_mcu[v * 8 + u] = slow_fdct(u, v, mcu);
 617         }
 618     }
 619     for ( int i = 0; i < 64; ++i ) {
 620         float fval = dct_mcu[i] / (qt[i]);
 621         int val = (int)((fval > 0) ? floorf(fval + 0.5f) : ceilf(fval - 0.5f));
 622         du[tjei_zig_zag[i]] = val;
 623     }
 624 #endif
 625
 626     uint16_t vli[2];
 627
 628     // Encode DC coefficient.
 629     int diff = du[0] - *pred;
 630     *pred = du[0];
 631     if ( diff != 0 ) {
 632         tjei_calculate_variable_length_int(diff, vli);
 633         // Write number of bits with Huffman coding
 634         tjei_write_bits(state, bitbuffer, location, huff_dc_len[vli[1]], huff_dc_code[vli[1]]);
 635         // Write the bits.
 636         tjei_write_bits(state, bitbuffer, location, vli[1], vli[0]);
 637     } else {
 638         tjei_write_bits(state, bitbuffer, location, huff_dc_len[0], huff_dc_code[0]);
 639     }
 640
 641     // ==== Encode AC coefficients ====
 642
 643     int last_non_zero_i = 0;
 644     // Find the last non-zero element.
 645     for ( int i = 63; i > 0; --i ) {
 646         if (du[i] != 0) {
 647             last_non_zero_i = i;
 648             break;
 649         }
 650     }
 651
 652     for ( int i = 1; i <= last_non_zero_i; ++i ) {
 653         // If zero, increase count. If >=15, encode (FF,00)
 654         int zero_count = 0;
 655         while ( du[i] == 0 ) {
 656             ++zero_count;
 657             ++i;
 658             if (zero_count == 16) {
 659                 // encode (ff,00) == 0xf0
 660                 tjei_write_bits(state, bitbuffer, location, huff_ac_len[0xf0], huff_ac_code[0xf0]);
 661                 zero_count = 0;
 662             }
 663         }
 664         tjei_calculate_variable_length_int(du[i], vli);
 665
 666         assert(zero_count < 0x10);
 667         assert(vli[1] <= 10);
 668
 669         uint16_t sym1 = (uint16_t)((uint16_t)zero_count << 4) | vli[1];
 670
 671         assert(huff_ac_len[sym1] != 0);
 672
 673         // Write symbol 1  --- (RUNLENGTH, SIZE)
 674         tjei_write_bits(state, bitbuffer, location, huff_ac_len[sym1], huff_ac_code[sym1]);
 675         // Write symbol 2  --- (AMPLITUDE)
 676         tjei_write_bits(state, bitbuffer, location, vli[1], vli[0]);
 677     }
 678
 679     if (last_non_zero_i != 63) {
 680         // write EOB HUFF(00,00)
 681         tjei_write_bits(state, bitbuffer, location, huff_ac_len[0], huff_ac_code[0]);
 682     }
 683     return;
 684 }
 685
 686 enum {
 687     TJEI_LUMA_DC,
 688     TJEI_LUMA_AC,
 689     TJEI_CHROMA_DC,
 690     TJEI_CHROMA_AC,
 691 };
 692
 693 #if TJE_USE_FAST_DCT
 694 struct TJEProcessedQT
 695 {
 696     float chroma[64];
 697     float luma[64];
 698 };
 699 #endif
 700
 701 // Set up huffman tables in state.
 702 static void tjei_huff_expand(TJEState* state)
 703 {
 704     assert(state);
 705
 706     state->ht_bits[TJEI_LUMA_DC]   = tjei_default_ht_luma_dc_len;
 707     state->ht_bits[TJEI_LUMA_AC]   = tjei_default_ht_luma_ac_len;
 708     state->ht_bits[TJEI_CHROMA_DC] = tjei_default_ht_chroma_dc_len;
 709     state->ht_bits[TJEI_CHROMA_AC] = tjei_default_ht_chroma_ac_len;
 710
 711     state->ht_vals[TJEI_LUMA_DC]   = tjei_default_ht_luma_dc;
 712     state->ht_vals[TJEI_LUMA_AC]   = tjei_default_ht_luma_ac;
 713     state->ht_vals[TJEI_CHROMA_DC] = tjei_default_ht_chroma_dc;
 714     state->ht_vals[TJEI_CHROMA_AC] = tjei_default_ht_chroma_ac;
 715
 716     // How many codes in total for each of LUMA_(DC|AC) and CHROMA_(DC|AC)
 717     int32_t spec_tables_len[4] = { 0 };
 718
 719     for ( int i = 0; i < 4; ++i ) {
 720         for ( int k = 0; k < 16; ++k ) {
 721             spec_tables_len[i] += state->ht_bits[i][k];
 722         }
 723     }
 724
 725     // Fill out the extended tables..
 726     uint8_t huffsize[4][257];
 727     uint16_t huffcode[4][256];
 728     for ( int i = 0; i < 4; ++i ) {
 729         assert (256 >= spec_tables_len[i]);
 730         tjei_huff_get_code_lengths(huffsize[i], state->ht_bits[i]);
 731         tjei_huff_get_codes(huffcode[i], huffsize[i], spec_tables_len[i]);
 732     }
 733     for ( int i = 0; i < 4; ++i ) {
 734         int64_t count = spec_tables_len[i];
 735         tjei_huff_get_extended(state->ehuffsize[i],
 736                                state->ehuffcode[i],
 737                                state->ht_vals[i],
 738                                &huffsize[i][0],
 739                                &huffcode[i][0], count);
 740     }
 741 }
 742
 743 //static int tjei_encode_main(TJEState* state,
 744 //                            const unsigned char* src_data,
 745 //                            const int width,
 746 //                            const int height,
 747 //                            const int src_num_components)
 748 //{
 749 //    if (src_num_components != 3 && src_num_components != 4) {
 750 //        return 0;
 751 //    }
 752 //
 753 //    if (width > 0xffff || height > 0xffff) {
 754 //        return 0;
 755 //    }
 756 //
 757 //#if TJE_USE_FAST_DCT
 758 //    struct TJEProcessedQT pqt;
 759 //    // Again, taken from classic japanese implementation.
 760 //    //
 761 //    /* For float AA&N IDCT method, divisors are equal to quantization
 762 //     * coefficients scaled by scalefactor[row]*scalefactor[col], where
 763 //     *   scalefactor[0] = 1
 764 //     *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
 765 //     * We apply a further scale factor of 8.
 766 //     * What's actually stored is 1/divisor so that the inner loop can
 767 //     * use a multiplication rather than a division.
 768 //     */
 769 //
 770 //
 771 //    // build (de)quantization tables
 772 //    for(int y=0; y<8; y++) {
 773 //        for(int x=0; x<8; x++) {
 774 //            int i = y*8 + x;
 775 //            pqt.luma[y*8+x] = 1.0f / (8 * aan_scales[x] * aan_scales[y] * state->qt_luma[tjei_zig_zag[i]]);
 776 //            pqt.chroma[y*8+x] = 1.0f / (8 * aan_scales[x] * aan_scales[y] * state->qt_chroma[tjei_zig_zag[i]]);
 777 //        }
 778 //    }
 779 //#endif
 780 //
 781 //    { // Write header
 782 //        TJEJPEGHeader header;
 783 //        // JFIF header.
 784 //        header.SOI = tjei_be_word(0xffd8);  // Sequential DCT
 785 //        header.APP0 = tjei_be_word(0xffe0);
 786 //
 787 //        uint16_t jfif_len = sizeof(TJEJPEGHeader) - 4 /*SOI & APP0 markers*/;
 788 //        header.jfif_len = tjei_be_word(jfif_len);
 789 //        memcpy(header.jfif_id, (void*)tjeik_jfif_id, 5);
 790 //        header.version = tjei_be_word(0x0102);
 791 //        header.units = 0x01;  // Dots-per-inch
 792 //        header.x_density = tjei_be_word(0x0060);  // 96 DPI
 793 //        header.y_density = tjei_be_word(0x0060);  // 96 DPI
 794 //        header.x_thumb = 0;
 795 //        header.y_thumb = 0;
 796 //        tjei_write(state, &header, sizeof(TJEJPEGHeader), 1);
 797 //    }
 798 //    {  // Write comment
 799 //        TJEJPEGComment com;
 800 //        uint16_t com_len = 2 + sizeof(tjeik_com_str) - 1;
 801 //        // Comment
 802 //        com.com = tjei_be_word(0xfffe);
 803 //        com.com_len = tjei_be_word(com_len);
 804 //        memcpy(com.com_str, (void*)tjeik_com_str, sizeof(tjeik_com_str)-1);
 805 //        tjei_write(state, &com, sizeof(TJEJPEGComment), 1);
 806 //    }
 807 //
 808 //    // Write quantization tables.
 809 //    tjei_write_DQT(state, state->qt_luma, 0x00);
 810 //    tjei_write_DQT(state, state->qt_chroma, 0x01);
 811 //
 812 //    {  // Write the frame marker.
 813 //        TJEFrameHeader header;
 814 //        header.SOF = tjei_be_word(0xffc0);
 815 //        header.len = tjei_be_word(8 + 3 * 3);
 816 //        header.precision = 8;
 817 //        assert(width <= 0xffff);
 818 //        assert(height <= 0xffff);
 819 //        header.width = tjei_be_word((uint16_t)width);
 820 //        header.height = tjei_be_word((uint16_t)height);
 821 //        header.num_components = 3;
 822 //        uint8_t tables[3] = {
 823 //            0,  // Luma component gets luma table (see tjei_write_DQT call above.)
 824 //            1,  // Chroma component gets chroma table
 825 //            1,  // Chroma component gets chroma table
 826 //        };
 827 //        for (int i = 0; i < 3; ++i) {
 828 //            TJEComponentSpec spec;
 829 //            spec.component_id = (uint8_t)(i + 1);  // No particular reason. Just 1, 2, 3.
 830 //            spec.sampling_factors = (uint8_t)0x11;
 831 //            spec.qt = tables[i];
 832 //
 833 //            header.component_spec[i] = spec;
 834 //        }
 835 //        // Write to file.
 836 //        tjei_write(state, &header, sizeof(TJEFrameHeader), 1);
 837 //    }
 838 //
 839 //    tjei_write_DHT(state, state->ht_bits[TJEI_LUMA_DC],   state->ht_vals[TJEI_LUMA_DC], TJEI_DC, 0);
 840 //    tjei_write_DHT(state, state->ht_bits[TJEI_LUMA_AC],   state->ht_vals[TJEI_LUMA_AC], TJEI_AC, 0);
 841 //    tjei_write_DHT(state, state->ht_bits[TJEI_CHROMA_DC], state->ht_vals[TJEI_CHROMA_DC], TJEI_DC, 1);
 842 //    tjei_write_DHT(state, state->ht_bits[TJEI_CHROMA_AC], state->ht_vals[TJEI_CHROMA_AC], TJEI_AC, 1);
 843 //
 844 //    // Write start of scan
 845 //    {
 846 //        TJEScanHeader header;
 847 //        header.SOS = tjei_be_word(0xffda);
 848 //        header.len = tjei_be_word((uint16_t)(6 + (sizeof(TJEFrameComponentSpec) * 3)));
 849 //        header.num_components = 3;
 850 //
 851 //        uint8_t tables[3] = {
 852 //            0x00,
 853 //            0x11,
 854 //            0x11,
 855 //        };
 856 //        for (int i = 0; i < 3; ++i) {
 857 //            TJEFrameComponentSpec cs;
 858 //            // Must be equal to component_id from frame header above.
 859 //            cs.component_id = (uint8_t)(i + 1);
 860 //            cs.dc_ac = (uint8_t)tables[i];
 861 //
 862 //            header.component_spec[i] = cs;
 863 //        }
 864 //        header.first = 0;
 865 //        header.last  = 63;
 866 //        header.ah_al = 0;
 867 //        tjei_write(state, &header, sizeof(TJEScanHeader), 1);
 868 //
 869 //    }
 870 //    // Write compressed data.
 871 //
 872 //    float du_y[64];
 873 //    float du_b[64];
 874 //    float du_r[64];
 875 //
 876 //    // Set diff to 0.
 877 //    int pred_y = 0;
 878 //    int pred_b = 0;
 879 //    int pred_r = 0;
 880 //
 881 //    // Bit stack
 882 //    uint32_t bitbuffer = 0;
 883 //    uint32_t location = 0;
 884 //
 885 //
 886 //    for ( int y = 0; y < height; y += 8 ) {
 887 //        for ( int x = 0; x < width; x += 8 ) {
 888 //            // Block loop: ====
 889 //            for ( int off_y = 0; off_y < 8; ++off_y ) {
 890 //                for ( int off_x = 0; off_x < 8; ++off_x ) {
 891 //                    int block_index = (off_y * 8 + off_x);
 892 //
 893 //                    int src_index = (((y + off_y) * width) + (x + off_x)) * src_num_components;
 894 //
 895 //                    int col = x + off_x;
 896 //                    int row = y + off_y;
 897 //
 898 //                    if(row >= height) {
 899 //                        src_index -= (width * (row - height + 1)) * src_num_components;
 900 //                    }
 901 //                    if(col >= width) {
 902 //                        src_index -= (col - width + 1) * src_num_components;
 903 //                    }
 904 //                    assert(src_index < width * height * src_num_components);
 905 //
 906 //                    uint8_t r = src_data[src_index + 0];
 907 //                    uint8_t g = src_data[src_index + 1];
 908 //                    uint8_t b = src_data[src_index + 2];
 909 //
 910 //                    float luma = 0.299f   * r + 0.587f    * g + 0.114f    * b - 128;
 911 //                    float cb   = -0.1687f * r - 0.3313f   * g + 0.5f      * b;
 912 //                    float cr   = 0.5f     * r - 0.4187f   * g - 0.0813f   * b;
 913 //
 914 //                    du_y[block_index] = luma;
 915 //                    du_b[block_index] = cb;
 916 //                    du_r[block_index] = cr;
 917 //                }
 918 //            }
 919 //
 920 //            tjei_encode_and_write_MCU(state, du_y,
 921 //#if TJE_USE_FAST_DCT
 922 //                                     pqt.luma,
 923 //#else
 924 //                                     state->qt_luma,
 925 //#endif
 926 //                                     state->ehuffsize[TJEI_LUMA_DC], state->ehuffcode[TJEI_LUMA_DC],
 927 //                                     state->ehuffsize[TJEI_LUMA_AC], state->ehuffcode[TJEI_LUMA_AC],
 928 //                                     &pred_y, &bitbuffer, &location);
 929 //            tjei_encode_and_write_MCU(state, du_b,
 930 //#if TJE_USE_FAST_DCT
 931 //                                     pqt.chroma,
 932 //#else
 933 //                                     state->qt_chroma,
 934 //#endif
 935 //                                     state->ehuffsize[TJEI_CHROMA_DC], state->ehuffcode[TJEI_CHROMA_DC],
 936 //                                     state->ehuffsize[TJEI_CHROMA_AC], state->ehuffcode[TJEI_CHROMA_AC],
 937 //                                     &pred_b, &bitbuffer, &location);
 938 //            tjei_encode_and_write_MCU(state, du_r,
 939 //#if TJE_USE_FAST_DCT
 940 //                                     pqt.chroma,
 941 //#else
 942 //                                     state->qt_chroma,
 943 //#endif
 944 //                                     state->ehuffsize[TJEI_CHROMA_DC], state->ehuffcode[TJEI_CHROMA_DC],
 945 //                                     state->ehuffsize[TJEI_CHROMA_AC], state->ehuffcode[TJEI_CHROMA_AC],
 946 //                                     &pred_r, &bitbuffer, &location);
 947 //
 948 //
 949 //        }
 950 //    }
 951 //
 952 //    // Finish the image.
 953 //    { // Flush
 954 //        if (location > 0 && location < 8) {
 955 //            tjei_write_bits(state, &bitbuffer, &location, (uint16_t)(8 - location), 0);
 956 //        }
 957 //    }
 958 //    uint16_t EOI = tjei_be_word(0xffd9);
 959 //    tjei_write(state, &EOI, sizeof(uint16_t), 1);
 960 //
 961 //    if (state->output_buffer_count) {
 962 //        state->write_context.func(state->write_context.context, state->output_buffer, (int)state->output_buffer_count);
 963 //        state->output_buffer_count = 0;
 964 //    }
 965 //
 966 //    return 1;
 967 //}
 968
 969
 970 //int tje_encode_with_func(tje_write_func* func,
 971 //                         void* context,
 972 //                         const int quality,
 973 //                         const int width,
 974 //                         const int height,
 975 //                         const int num_components,
 976 //                         const unsigned char* src_data)
 977 //{
 978 //    if (quality < 1 || quality > 3) {
 979 //        tje_log("[ERROR] -- Valid 'quality' values are 1 (lowest), 2, or 3 (highest)");
 980 //        return 0;
 981 //    }
 982 //
 983 //    TJEState state = { 0 };
 984 //
 985 //
 986 //
 987 //    TJEWriteContext wc = { 0 };
 988 //
 989 //    wc.context = context;
 990 //    wc.func = func;
 991 //
 992 //    state.write_context = wc;
 993 //
 994 //
 995 //    tjei_huff_expand(&state);
 996 //
 997 //    int result = tjei_encode_main(&state, src_data, width, height, num_components);
 998 //
 999 //    return result;
1000 //}
1001 // ============================================================
1002
1003 typedef struct
1004 {
1005         TJEState encode_state;
1006 #if TJE_USE_FAST_DCT
1007     struct TJEProcessedQT pqt;
1008 #endif
1009         uint32_t width;
1010         uint32_t height;
1011         uint32_t num_components;
1012         uint32_t cur_height;
1013     // Set diff to 0.
1014     int pred_y;
1015     int pred_b;
1016     int pred_r;
1017
1018     // Bit stack
1019     uint32_t bitbuffer;
1020     uint32_t location;
1021 }TJE_ContextStruct;
1022 void *jpeg_encode_init(tje_write_func* func, void* context, uint8_t quality, uint32_t width, uint32_t height, uint8_t src_num_components)
1023 {
1024     if (quality < 1 || quality > 3) {
1025         tje_log("Valid 'quality' %d values are 1 (lowest), 2, or 3 (highest)", quality);
1026         return NULL;
1027     }
1028     if (src_num_components != 3 && src_num_components != 4) {
1029         return NULL;
1030     }
1031
1032     if (width > 0xffff || height > 0xffff) {
1033         return NULL;
1034     }
1035
1036         TJE_ContextStruct *ctx = calloc(1, sizeof(TJE_ContextStruct));
1037     uint8_t qt_factor = 1;
1038     switch(quality) {
1039     case 3:
1040         for ( int i = 0; i < 64; ++i ) {
1041                 ctx->encode_state.qt_luma[i]   = 1;
1042                 ctx->encode_state.qt_chroma[i] = 1;
1043         }
1044         break;
1045     case 2:
1046         qt_factor = 10;
1047         // don't break. fall through.
1048     case 1:
1049         for ( int i = 0; i < 64; ++i ) {
1050                 ctx->encode_state.qt_luma[i]   = tjei_default_qt_luma_from_spec[i] / qt_factor;
1051             if (ctx->encode_state.qt_luma[i] == 0) {
1052                 ctx->encode_state.qt_luma[i] = 1;
1053             }
1054             ctx->encode_state.qt_chroma[i] = tjei_default_qt_chroma_from_paper[i] / qt_factor;
1055             if (ctx->encode_state.qt_chroma[i] == 0) {
1056                 ctx->encode_state.qt_chroma[i] = 1;
1057             }
1058         }
1059         break;
1060     default:
1061         assert(!"invalid code path");
1062         break;
1063     }
1064
1065     ctx->encode_state.write_context.func = func;
1066     ctx->encode_state.write_context.context = context;
1067     ctx->width = width;
1068     ctx->height = height;
1069     ctx->num_components = src_num_components;
1070     tjei_huff_expand(&ctx->encode_state);
1071     TJEState* state = &ctx->encode_state;
1072 #if TJE_USE_FAST_DCT
1073     // Again, taken from classic japanese implementation.
1074     //
1075     /* For float AA&N IDCT method, divisors are equal to quantization
1076      * coefficients scaled by scalefactor[row]*scalefactor[col], where
1077      *   scalefactor[0] = 1
1078      *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
1079      * We apply a further scale factor of 8.
1080      * What's actually stored is 1/divisor so that the inner loop can
1081      * use a multiplication rather than a division.
1082      */
1083
1084
1085     // build (de)quantization tables
1086     for(int y=0; y<8; y++) {
1087         for(int x=0; x<8; x++) {
1088             int i = y*8 + x;
1089             ctx->pqt.luma[y*8+x] = 1.0f / (8 * aan_scales[x] * aan_scales[y] * state->qt_luma[tjei_zig_zag[i]]);
1090             ctx->pqt.chroma[y*8+x] = 1.0f / (8 * aan_scales[x] * aan_scales[y] * state->qt_chroma[tjei_zig_zag[i]]);
1091         }
1092     }
1093 #endif
1094
1095     { // Write header
1096         TJEJPEGHeader header;
1097         // JFIF header.
1098         header.SOI = tjei_be_word(0xffd8);  // Sequential DCT
1099         header.APP0 = tjei_be_word(0xffe0);
1100
1101         uint16_t jfif_len = sizeof(TJEJPEGHeader) - 4 /*SOI & APP0 markers*/;
1102         header.jfif_len = tjei_be_word(jfif_len);
1103         memcpy(header.jfif_id, (void*)tjeik_jfif_id, 5);
1104         header.version = tjei_be_word(0x0102);
1105         header.units = 0x01;  // Dots-per-inch
1106         header.x_density = tjei_be_word(0x0060);  // 96 DPI
1107         header.y_density = tjei_be_word(0x0060);  // 96 DPI
1108         header.x_thumb = 0;
1109         header.y_thumb = 0;
1110         tjei_write(state, &header, sizeof(TJEJPEGHeader), 1);
1111     }
1112     {  // Write comment
1113         TJEJPEGComment com;
1114         uint16_t com_len = 2 + sizeof(tjeik_com_str) - 1;
1115         // Comment
1116         com.com = tjei_be_word(0xfffe);
1117         com.com_len = tjei_be_word(com_len);
1118         memcpy(com.com_str, (void*)tjeik_com_str, sizeof(tjeik_com_str)-1);
1119         tjei_write(state, &com, sizeof(TJEJPEGComment), 1);
1120     }
1121
1122     // Write quantization tables.
1123     tjei_write_DQT(state, state->qt_luma, 0x00);
1124     tjei_write_DQT(state, state->qt_chroma, 0x01);
1125
1126     {  // Write the frame marker.
1127         TJEFrameHeader header;
1128         header.SOF = tjei_be_word(0xffc0);
1129         header.len = tjei_be_word(8 + 3 * 3);
1130         header.precision = 8;
1131         assert(width <= 0xffff);
1132         assert(height <= 0xffff);
1133         header.width = tjei_be_word((uint16_t)width);
1134         header.height = tjei_be_word((uint16_t)height);
1135         header.num_components = 3;
1136         uint8_t tables[3] = {
1137             0,  // Luma component gets luma table (see tjei_write_DQT call above.)
1138             1,  // Chroma component gets chroma table
1139             1,  // Chroma component gets chroma table
1140         };
1141         for (int i = 0; i < 3; ++i) {
1142             TJEComponentSpec spec;
1143             spec.component_id = (uint8_t)(i + 1);  // No particular reason. Just 1, 2, 3.
1144             spec.sampling_factors = (uint8_t)0x11;
1145             spec.qt = tables[i];
1146
1147             header.component_spec[i] = spec;
1148         }
1149         // Write to file.
1150         tjei_write(state, &header, sizeof(TJEFrameHeader), 1);
1151     }
1152
1153     tjei_write_DHT(state, state->ht_bits[TJEI_LUMA_DC],   state->ht_vals[TJEI_LUMA_DC], TJEI_DC, 0);
1154     tjei_write_DHT(state, state->ht_bits[TJEI_LUMA_AC],   state->ht_vals[TJEI_LUMA_AC], TJEI_AC, 0);
1155     tjei_write_DHT(state, state->ht_bits[TJEI_CHROMA_DC], state->ht_vals[TJEI_CHROMA_DC], TJEI_DC, 1);
1156     tjei_write_DHT(state, state->ht_bits[TJEI_CHROMA_AC], state->ht_vals[TJEI_CHROMA_AC], TJEI_AC, 1);
1157
1158     // Write start of scan
1159     {
1160         TJEScanHeader header;
1161         header.SOS = tjei_be_word(0xffda);
1162         header.len = tjei_be_word((uint16_t)(6 + (sizeof(TJEFrameComponentSpec) * 3)));
1163         header.num_components = 3;
1164
1165         uint8_t tables[3] = {
1166             0x00,
1167             0x11,
1168             0x11,
1169         };
1170         for (int i = 0; i < 3; ++i) {
1171             TJEFrameComponentSpec cs;
1172             // Must be equal to component_id from frame header above.
1173             cs.component_id = (uint8_t)(i + 1);
1174             cs.dc_ac = (uint8_t)tables[i];
1175
1176             header.component_spec[i] = cs;
1177         }
1178         header.first = 0;
1179         header.last  = 63;
1180         header.ah_al = 0;
1181         tjei_write(state, &header, sizeof(TJEScanHeader), 1);
1182
1183     }
1184     return ctx;
1185 }
1186
1187 void jpeg_encode_run(void *ctx, uint8_t *src_data)
1188 {
1189     float du_y[64];
1190     float du_b[64];
1191     float du_r[64];
1192     TJE_ContextStruct *handle = (TJE_ContextStruct *)ctx;
1193     TJEState* state = &handle->encode_state;
1194     uint32_t width = handle->width;
1195     uint32_t height = handle->height;
1196     uint32_t src_num_components = handle->num_components;
1197     uint32_t block_index, src_index, col, row;
1198     uint8_t r,g,b;
1199         for ( uint32_t x = 0; x < width; x += 8 ) {
1200                 // Block loop: ====
1201                 for ( uint32_t off_y = 0; off_y < 8; ++off_y ) {
1202                         for ( uint32_t off_x = 0; off_x < 8; ++off_x ) {
1203                                 block_index = (off_y * 8 + off_x);
1204                                 src_index = (((0 + off_y) * width) + (x + off_x)) * src_num_components;
1205 //                              if (is_rgb)
1206 //                              {
1207 //                                      r = src_data[src_index + 0];
1208 //                                      g = src_data[src_index + 1];
1209 //                                      b = src_data[src_index + 2];
1210 //                                      du_y[block_index] = 0.299f   * r + 0.587f    * g + 0.114f    * b - 128;
1211 //                                      du_b[block_index] = -0.1687f * r - 0.3313f   * g + 0.5f      * b;
1212 //                                      du_r[block_index] = 0.5f     * r - 0.4187f   * g - 0.0813f   * b;
1213 //                              }
1214 //                              else
1215 //                              {
1216 //                                      du_y[block_index] = src_data[src_index + 0];
1217 //                                      du_b[block_index] = src_data[src_index + 1];
1218 //                                      du_r[block_index] = src_data[src_index + 2];
1219 //                                      du_y[block_index] -= 128;
1220 //                                      du_b[block_index] -= 128;
1221 //                                      du_r[block_index] -= 128;
1222 //                              }
1223                                 du_y[block_index] = src_data[src_index + 0];
1224                                 du_b[block_index] = src_data[src_index + 1];
1225                                 du_r[block_index] = src_data[src_index + 2];
1226                                 du_y[block_index] -= 128;
1227                                 du_b[block_index] -= 128;
1228                                 du_r[block_index] -= 128;
1229                         }
1230                 }
1231
1232                 tjei_encode_and_write_MCU(state, du_y,
1233 #if TJE_USE_FAST_DCT
1234                                 handle->pqt.luma,
1235 #else
1236                                                                  state->qt_luma,
1237 #endif
1238                                                                  state->ehuffsize[TJEI_LUMA_DC], state->ehuffcode[TJEI_LUMA_DC],
1239                                                                  state->ehuffsize[TJEI_LUMA_AC], state->ehuffcode[TJEI_LUMA_AC],
1240                                                                  &handle->pred_y, &handle->bitbuffer, &handle->location);
1241                 tjei_encode_and_write_MCU(state, du_b,
1242 #if TJE_USE_FAST_DCT
1243                                 handle->pqt.chroma,
1244 #else
1245                                                                  state->qt_chroma,
1246 #endif
1247                                                                  state->ehuffsize[TJEI_CHROMA_DC], state->ehuffcode[TJEI_CHROMA_DC],
1248                                                                  state->ehuffsize[TJEI_CHROMA_AC], state->ehuffcode[TJEI_CHROMA_AC],
1249                                                                  &handle->pred_b, &handle->bitbuffer, &handle->location);
1250                 tjei_encode_and_write_MCU(state, du_r,
1251 #if TJE_USE_FAST_DCT
1252                                 handle->pqt.chroma,
1253 #else
1254                                                                  state->qt_chroma,
1255 #endif
1256                                                                  state->ehuffsize[TJEI_CHROMA_DC], state->ehuffcode[TJEI_CHROMA_DC],
1257                                                                  state->ehuffsize[TJEI_CHROMA_AC], state->ehuffcode[TJEI_CHROMA_AC],
1258                                                                  &handle->pred_r, &handle->bitbuffer, &handle->location);
1259
1260
1261         }
1262     handle->cur_height += 8;
1263 }
1264
1265 void jpeg_encode_end(void *ctx)
1266 {
1267     uint16_t EOI = tjei_be_word(0xffd9);
1268     TJE_ContextStruct *handle = (TJE_ContextStruct *)ctx;
1269     TJEState* state = &handle->encode_state;
1270     tjei_write(state, &EOI, sizeof(uint16_t), 1);
1271
1272     if (state->output_buffer_count) {
1273         state->write_context.func(state->write_context.context, state->output_buffer, (int)state->output_buffer_count);
1274         state->output_buffer_count = 0;
1275     }
1276 }