libavcodec/movtextdec.c

   1 /*
   2  * 3GPP TS 26.245 Timed Text decoder
   3  * Copyright (c) 2012  Philip Langdale <philipl@overt.org>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 #include "avcodec.h"
  23 #include "ass.h"
  24 #include "libavutil/opt.h"
  25 #include "libavutil/avstring.h"
  26 #include "libavutil/common.h"
  27 #include "libavutil/bprint.h"
  28 #include "libavutil/intreadwrite.h"
  29 #include "libavutil/mem.h"
  30 #include "bytestream.h"
  31 #include "codec_internal.h"
  32
/* Bits of the face-style-flags byte of a style record
 * (see mov_text_parse_style_record()). */
#define STYLE_FLAG_BOLD         (1<<0)
#define STYLE_FLAG_ITALIC       (1<<1)
#define STYLE_FLAG_UNDERLINE    (1<<2)

/* Number of extradata bytes mov_text_tx3g() requires before it looks at the
 * individual font table entries. */
#define BOX_SIZE_INITIAL    40

/* Bits of MovTextContext.box_flags; each one is set by the decode callback
 * of the corresponding modifier box. */
#define STYL_BOX   (1<<0)
#define HLIT_BOX   (1<<1)
#define HCLR_BOX   (1<<2)
#define TWRP_BOX   (1<<3)

/* Alignment codes stored in MovTextDefault.alignment and handed to
 * ff_ass_subtitle_header_full(): 1-3 bottom row, 4-6 middle row,
 * 7-9 top row, each from left to right. */
#define BOTTOM_LEFT     1
#define BOTTOM_CENTER   2
#define BOTTOM_RIGHT    3
#define MIDDLE_LEFT     4
#define MIDDLE_CENTER   5
#define MIDDLE_RIGHT    6
#define TOP_LEFT        7
#define TOP_CENTER      8
#define TOP_RIGHT       9

/* Swap the lowest and the third byte of a 24-bit colour value; the middle
 * byte stays in place. */
#define RGB_TO_BGR(c) (((c) & 0xff) << 16 | ((c) & 0xff00) | (((c) >> 16) & 0xff))
  55
/* One entry of the font table read from the tx3g extradata. */
typedef struct {
    uint16_t font_id;   /* identifier that style records refer to */
    char *font;         /* NUL-terminated font name, allocated in
                         * mov_text_tx3g(), freed in mov_text_cleanup_ftab() */
} FontRecord;
  60
/* One style record: a text range plus the font attributes applied to it. */
typedef struct {
    uint16_t start;     /* position where the style begins; compared against
                         * the character counter of text_to_ass() */
    uint16_t end;       /* position where the style is reset again */
    uint8_t flags;      /* raw face-style-flags byte (STYLE_FLAG_*) */
    uint8_t bold;       /* 0 or 1, derived from flags */
    uint8_t italic;     /* 0 or 1, derived from flags */
    uint8_t underline;  /* 0 or 1, derived from flags */
    int color;          /* 24-bit colour after RGB_TO_BGR() */
    uint8_t alpha;      /* alpha byte as stored in the record */
    uint8_t fontsize;
    uint16_t font_id;   /* matched against FontRecord.font_id */
} StyleBox;
  73
/* Defaults taken from the tx3g extradata by mov_text_tx3g(). */
typedef struct {
    StyleBox style;      /* default style record */
    const char *font;    /* ASS_DEFAULT_FONT or the name of a font table
                          * entry; not owned by this struct */
    int back_color;      /* 24-bit background colour after RGB_TO_BGR() */
    uint8_t back_alpha;  /* background alpha byte as stored */
    int alignment;       /* one of the BOTTOM_x / MIDDLE_x / TOP_x codes */
} MovTextDefault;
  81
/* Payload of an 'hlit' box: the highlighted text range, compared against
 * the character counter of text_to_ass(). */
typedef struct {
    uint16_t hlit_start;
    uint16_t hlit_end;
} HighlightBox;
  86
/* Payload of an 'hclr' box: four raw bytes copied by decode_hclr().
 * text_to_ass() reads elements 0..2 and emits them in the order 2, 1, 0. */
typedef struct {
   uint8_t hlit_color[4];
} HilightcolorBox;
  90
/* Payload of a 'twrp' box. text_to_ass() emits "\q1" when wrap_flag is 1
 * and "\q2" for every other value. */
typedef struct {
    uint8_t wrap_flag;
} TextWrapBox;
  94
/* Private decoder state. */
typedef struct {
    AVClass *class;         /* NOTE(review): presumably filled in by the
                             * option system from priv_class - confirm */
    StyleBox *s;            /* style records of the current packet */
    HighlightBox h;         /* valid when box_flags has HLIT_BOX */
    HilightcolorBox c;      /* valid when box_flags has HCLR_BOX */
    FontRecord *ftab;       /* font table from the extradata */
    TextWrapBox w;          /* valid when box_flags has TWRP_BOX */
    MovTextDefault d;       /* defaults from the extradata */
    uint8_t box_flags;      /* STYL_BOX | HLIT_BOX | HCLR_BOX | TWRP_BOX */
    uint16_t style_entries, ftab_entries; /* element counts of s and ftab */
    int readorder;          /* passed to ff_ass_add_rect() and incremented
                             * per call; reset by mov_text_flush() */
    int frame_width;        /* "width" option; 0 selects ASS_DEFAULT_PLAYRESX
                             * together with the default height */
    int frame_height;       /* "height" option; 0 selects
                             * ASS_DEFAULT_PLAYRESY together with the
                             * default width */
} MovTextContext;
 109
/* Description of one supported modifier box type. */
typedef struct {
    uint32_t type;          /* four character box tag */
    unsigned base_size;     /* smallest payload size for which decode is
                             * called by mov_text_decode_frame() */
    /* Payload parser; receives the payload start, the decoder context and
     * the payload size. */
    int (*decode)(const uint8_t *tsmb, MovTextContext *m, uint64_t size);
} Box;
 115
 116 static void mov_text_cleanup(MovTextContext *m)
 117 {
 118     if (m->box_flags & STYL_BOX) {
 119         av_freep(&m->s);
 120         m->style_entries = 0;
 121     }
 122 }
 123
 124 static void mov_text_cleanup_ftab(MovTextContext *m)
 125 {
 126     for (unsigned i = 0; i < m->ftab_entries; i++)
 127         av_freep(&m->ftab[i].font);
 128     av_freep(&m->ftab);
 129     m->ftab_entries = 0;
 130 }
 131
 132 static void mov_text_parse_style_record(StyleBox *style, const uint8_t **ptr)
 133 {
 134     // fontID
 135     style->font_id   = bytestream_get_be16(ptr);
 136     // face-style-flags
 137     style->flags     = bytestream_get_byte(ptr);
 138     style->bold      = !!(style->flags & STYLE_FLAG_BOLD);
 139     style->italic    = !!(style->flags & STYLE_FLAG_ITALIC);
 140     style->underline = !!(style->flags & STYLE_FLAG_UNDERLINE);
 141     // fontsize
 142     style->fontsize  = bytestream_get_byte(ptr);
 143     // Primary color
 144     style->color     = bytestream_get_be24(ptr);
 145     style->color     = RGB_TO_BGR(style->color);
 146     style->alpha     = bytestream_get_byte(ptr);
 147 }
 148
 149 static int mov_text_tx3g(AVCodecContext *avctx, MovTextContext *m)
 150 {
 151     const uint8_t *tx3g_ptr = avctx->extradata;
 152     int i, j = -1, font_length, remaining = avctx->extradata_size - BOX_SIZE_INITIAL;
 153     int8_t v_align, h_align;
 154     unsigned ftab_entries;
 155
 156     m->ftab_entries = 0;
 157     if (remaining < 0)
 158         return -1;
 159
 160     // Display Flags
 161     tx3g_ptr += 4;
 162     // Alignment
 163     h_align = bytestream_get_byte(&tx3g_ptr);
 164     v_align = bytestream_get_byte(&tx3g_ptr);
 165     if (h_align == 0) {
 166         if (v_align == 0)
 167             m->d.alignment = TOP_LEFT;
 168         if (v_align == 1)
 169             m->d.alignment = MIDDLE_LEFT;
 170         if (v_align == -1)
 171             m->d.alignment = BOTTOM_LEFT;
 172     }
 173     if (h_align == 1) {
 174         if (v_align == 0)
 175             m->d.alignment = TOP_CENTER;
 176         if (v_align == 1)
 177             m->d.alignment = MIDDLE_CENTER;
 178         if (v_align == -1)
 179             m->d.alignment = BOTTOM_CENTER;
 180     }
 181     if (h_align == -1) {
 182         if (v_align == 0)
 183             m->d.alignment = TOP_RIGHT;
 184         if (v_align == 1)
 185             m->d.alignment = MIDDLE_RIGHT;
 186         if (v_align == -1)
 187             m->d.alignment = BOTTOM_RIGHT;
 188     }
 189     // Background Color
 190     m->d.back_color = bytestream_get_be24(&tx3g_ptr);
 191     m->d.back_color = RGB_TO_BGR(m->d.back_color);
 192     m->d.back_alpha = bytestream_get_byte(&tx3g_ptr);
 193     // BoxRecord
 194     tx3g_ptr += 8;
 195     // StyleRecord
 196     tx3g_ptr += 4;
 197     mov_text_parse_style_record(&m->d.style, &tx3g_ptr);
 198     // FontRecord
 199     // FontRecord Size
 200     tx3g_ptr += 4;
 201     // ftab
 202     tx3g_ptr += 4;
 203
 204     // In case of broken header, init default font
 205     m->d.font = ASS_DEFAULT_FONT;
 206
 207     ftab_entries = bytestream_get_be16(&tx3g_ptr);
 208     if (!ftab_entries)
 209         return 0;
 210     remaining   -= 3 * ftab_entries;
 211     if (remaining < 0)
 212         return AVERROR_INVALIDDATA;
 213     m->ftab = av_calloc(ftab_entries, sizeof(*m->ftab));
 214     if (!m->ftab)
 215         return AVERROR(ENOMEM);
 216     m->ftab_entries = ftab_entries;
 217
 218     for (i = 0; i < m->ftab_entries; i++) {
 219         m->ftab[i].font_id = bytestream_get_be16(&tx3g_ptr);
 220         if (m->ftab[i].font_id == m->d.style.font_id)
 221             j = i;
 222         font_length = bytestream_get_byte(&tx3g_ptr);
 223
 224         remaining  -= font_length;
 225         if (remaining < 0) {
 226             mov_text_cleanup_ftab(m);
 227             return -1;
 228         }
 229         m->ftab[i].font = av_malloc(font_length + 1);
 230         if (!m->ftab[i].font) {
 231             mov_text_cleanup_ftab(m);
 232             return AVERROR(ENOMEM);
 233         }
 234         bytestream_get_buffer(&tx3g_ptr, m->ftab[i].font, font_length);
 235         m->ftab[i].font[font_length] = '\0';
 236     }
 237     if (j >= 0)
 238         m->d.font = m->ftab[j].font;
 239     return 0;
 240 }
 241
 242 static int decode_twrp(const uint8_t *tsmb, MovTextContext *m, uint64_t size)
 243 {
 244     m->box_flags |= TWRP_BOX;
 245     m->w.wrap_flag = bytestream_get_byte(&tsmb);
 246     return 0;
 247 }
 248
 249 static int decode_hlit(const uint8_t *tsmb, MovTextContext *m, uint64_t size)
 250 {
 251     m->box_flags |= HLIT_BOX;
 252     m->h.hlit_start = bytestream_get_be16(&tsmb);
 253     m->h.hlit_end   = bytestream_get_be16(&tsmb);
 254     return 0;
 255 }
 256
 257 static int decode_hclr(const uint8_t *tsmb, MovTextContext *m, uint64_t size)
 258 {
 259     m->box_flags |= HCLR_BOX;
 260     bytestream_get_buffer(&tsmb, m->c.hlit_color, 4);
 261     return 0;
 262 }
 263
 264 static int styles_equivalent(const StyleBox *a, const StyleBox *b)
 265 {
 266 #define CMP(field) ((a)->field == (b)->field)
 267     return CMP(bold)  && CMP(italic)   && CMP(underline) && CMP(color) &&
 268            CMP(alpha) && CMP(fontsize) && CMP(font_id);
 269 #undef CMP
 270 }
 271
 272 static int decode_styl(const uint8_t *tsmb, MovTextContext *m, uint64_t size)
 273 {
 274     int i;
 275     int style_entries = bytestream_get_be16(&tsmb);
 276     StyleBox *tmp;
 277
 278     // A single style record is of length 12 bytes.
 279     if (2 + style_entries * 12 > size)
 280         return -1;
 281
 282     tmp = av_realloc_array(m->s, style_entries, sizeof(*m->s));
 283     if (!tmp)
 284         return AVERROR(ENOMEM);
 285     m->s             = tmp;
 286     m->style_entries = style_entries;
 287
 288     m->box_flags |= STYL_BOX;
 289     for(i = 0; i < m->style_entries; i++) {
 290         StyleBox *style = &m->s[i];
 291
 292         style->start = bytestream_get_be16(&tsmb);
 293         style->end   = bytestream_get_be16(&tsmb);
 294         if (style->end < style->start ||
 295             (i && style->start < m->s[i - 1].end)) {
 296             mov_text_cleanup(m);
 297             return AVERROR_INVALIDDATA;
 298         }
 299         if (style->start == style->end) {
 300             /* Skip this style as it applies to no character */
 301             tsmb += 8;
 302             m->style_entries--;
 303             i--;
 304             continue;
 305         }
 306
 307         mov_text_parse_style_record(style, &tsmb);
 308         if (styles_equivalent(style, &m->d.style)) {
 309             /* Skip this style as it is equivalent to the default style */
 310             m->style_entries--;
 311             i--;
 312             continue;
 313         } else if (i && style->start == style[-1].end &&
 314                    styles_equivalent(style, &style[-1])) {
 315             /* Merge the two adjacent styles */
 316             style[-1].end = style->end;
 317             m->style_entries--;
 318             i--;
 319             continue;
 320         }
 321     }
 322     return 0;
 323 }
 324
 325 static const Box box_types[] = {
 326     { MKBETAG('s','t','y','l'), 2, decode_styl },
 327     { MKBETAG('h','l','i','t'), 4, decode_hlit },
 328     { MKBETAG('h','c','l','r'), 4, decode_hclr },
 329     { MKBETAG('t','w','r','p'), 1, decode_twrp }
 330 };
 331
 332 const static size_t box_count = FF_ARRAY_ELEMS(box_types);
 333
 334 // Return byte length of the UTF-8 sequence starting at text[0]. 0 on error.
 335 static int get_utf8_length_at(const char *text, const char *text_end)
 336 {
 337     const char *start = text;
 338     int err = 0;
 339     uint32_t c;
 340     GET_UTF8(c, text < text_end ? (uint8_t)*text++ : (err = 1, 0), goto error;);
 341     if (err)
 342         goto error;
 343     return text - start;
 344 error:
 345     return 0;
 346 }
 347
 348 static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
 349                        AVCodecContext *avctx)
 350 {
 351     MovTextContext *m = avctx->priv_data;
 352     const StyleBox *const default_style = &m->d.style;
 353     int i = 0;
 354     int text_pos = 0;
 355     int entry = 0;
 356     int color = default_style->color;
 357
 358     if (text < text_end && m->box_flags & TWRP_BOX) {
 359         if (m->w.wrap_flag == 1) {
 360             av_bprintf(buf, "{\\q1}"); /* End of line wrap */
 361         } else {
 362             av_bprintf(buf, "{\\q2}"); /* No wrap */
 363         }
 364     }
 365
 366     while (text < text_end) {
 367         int len;
 368
 369         if ((m->box_flags & STYL_BOX) && entry < m->style_entries) {
 370             const StyleBox *style = &m->s[entry];
 371             if (text_pos == style->end) {
 372                 av_bprintf(buf, "{\\r}");
 373                 color = default_style->color;
 374                 entry++;
 375                 style++;
 376             }
 377             if (entry < m->style_entries && text_pos == style->start) {
 378                 if (style->bold ^ default_style->bold)
 379                     av_bprintf(buf, "{\\b%d}", style->bold);
 380                 if (style->italic ^ default_style->italic)
 381                     av_bprintf(buf, "{\\i%d}", style->italic);
 382                 if (style->underline ^ default_style->underline)
 383                     av_bprintf(buf, "{\\u%d}", style->underline);
 384                 if (style->fontsize != default_style->fontsize)
 385                     av_bprintf(buf, "{\\fs%d}", style->fontsize);
 386                 if (style->font_id != default_style->font_id)
 387                     for (i = 0; i < m->ftab_entries; i++) {
 388                         if (style->font_id == m->ftab[i].font_id)
 389                             av_bprintf(buf, "{\\fn%s}", m->ftab[i].font);
 390                     }
 391                 if (default_style->color != style->color) {
 392                     color = style->color;
 393                     av_bprintf(buf, "{\\1c&H%X&}", color);
 394                 }
 395                 if (default_style->alpha != style->alpha)
 396                     av_bprintf(buf, "{\\1a&H%02X&}", 255 - style->alpha);
 397             }
 398         }
 399         if (m->box_flags & HLIT_BOX) {
 400             if (text_pos == m->h.hlit_start) {
 401                 /* If hclr box is present, set the secondary color to the color
 402                  * specified. Otherwise, set primary color to white and secondary
 403                  * color to black. These colors will come from TextSampleModifier
 404                  * boxes in future and inverse video technique for highlight will
 405                  * be implemented.
 406                  */
 407                 if (m->box_flags & HCLR_BOX) {
 408                     av_bprintf(buf, "{\\2c&H%02x%02x%02x&}", m->c.hlit_color[2],
 409                                 m->c.hlit_color[1], m->c.hlit_color[0]);
 410                 } else {
 411                     av_bprintf(buf, "{\\1c&H000000&}{\\2c&HFFFFFF&}");
 412                 }
 413             }
 414             if (text_pos == m->h.hlit_end) {
 415                 if (m->box_flags & HCLR_BOX) {
 416                     av_bprintf(buf, "{\\2c&H%X&}", default_style->color);
 417                 } else {
 418                     av_bprintf(buf, "{\\1c&H%X&}{\\2c&H%X&}",
 419                                color, default_style->color);
 420                 }
 421             }
 422         }
 423
 424         len = get_utf8_length_at(text, text_end);
 425         if (len < 1) {
 426             av_log(avctx, AV_LOG_ERROR, "invalid UTF-8 byte in subtitle\n");
 427             len = 1;
 428         }
 429         switch (*text) {
 430         case '\r':
 431             break;
 432         case '\n':
 433             av_bprintf(buf, "\\N");
 434             break;
 435         default:
 436             av_bprint_append_data(buf, text, len);
 437             break;
 438         }
 439         text += len;
 440         text_pos++;
 441     }
 442
 443     return 0;
 444 }
 445
 446 static int mov_text_init(AVCodecContext *avctx) {
 447     /*
 448      * TODO: Handle the default text style.
 449      * NB: Most players ignore styles completely, with the result that
 450      * it's very common to find files where the default style is broken
 451      * and respecting it results in a worse experience than ignoring it.
 452      */
 453     int ret;
 454     MovTextContext *m = avctx->priv_data;
 455     ret = mov_text_tx3g(avctx, m);
 456     if (ret == 0) {
 457         const StyleBox *const default_style = &m->d.style;
 458         if (!m->frame_width || !m->frame_height) {
 459             m->frame_width = ASS_DEFAULT_PLAYRESX;
 460             m->frame_height = ASS_DEFAULT_PLAYRESY;
 461         }
 462         return ff_ass_subtitle_header_full(avctx,
 463                     m->frame_width, m->frame_height,
 464                     m->d.font, default_style->fontsize,
 465                     (255U - default_style->alpha) << 24 | default_style->color,
 466                     (255U - default_style->alpha) << 24 | default_style->color,
 467                     (255U - m->d.back_alpha) << 24 | m->d.back_color,
 468                     (255U - m->d.back_alpha) << 24 | m->d.back_color,
 469                     default_style->bold, default_style->italic, default_style->underline,
 470                     ASS_DEFAULT_BORDERSTYLE, m->d.alignment);
 471     } else
 472         return ff_ass_subtitle_header_default(avctx);
 473 }
 474
 475 static int mov_text_decode_frame(AVCodecContext *avctx, AVSubtitle *sub,
 476                                  int *got_sub_ptr, const AVPacket *avpkt)
 477 {
 478     MovTextContext *m = avctx->priv_data;
 479     int ret;
 480     AVBPrint buf;
 481     const char *ptr = avpkt->data, *end;
 482     int text_length;
 483     size_t i;
 484
 485     if (!ptr || avpkt->size < 2)
 486         return AVERROR_INVALIDDATA;
 487
 488     /*
 489      * A packet of size two with value zero is an empty subtitle
 490      * used to mark the end of the previous non-empty subtitle.
 491      * We can just drop them here as we have duration information
 492      * already. If the value is non-zero, then it's technically a
 493      * bad packet.
 494      */
 495     if (avpkt->size == 2)
 496         return AV_RB16(ptr) == 0 ? 0 : AVERROR_INVALIDDATA;
 497
 498     /*
 499      * The first two bytes of the packet are the length of the text string
 500      * In complex cases, there are style descriptors appended to the string
 501      * so we can't just assume the packet size is the string size.
 502      */
 503     text_length = AV_RB16(ptr);
 504     end = ptr + FFMIN(2 + text_length, avpkt->size);
 505     ptr += 2;
 506
 507     mov_text_cleanup(m);
 508
 509     m->style_entries = 0;
 510     m->box_flags = 0;
 511     // Note that the spec recommends lines be no longer than 2048 characters.
 512     av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
 513     if (text_length + 2 < avpkt->size) {
 514         const uint8_t *tsmb = end;
 515         const uint8_t *const tsmb_end = avpkt->data + avpkt->size;
 516         // A box is a minimum of 8 bytes.
 517         while (tsmb_end - tsmb >= 8) {
 518             uint64_t tsmb_size = bytestream_get_be32(&tsmb);
 519             uint32_t tsmb_type = bytestream_get_be32(&tsmb);
 520             int size_var, ret_tsmb;
 521
 522             if (tsmb_size == 1) {
 523                 if (tsmb_end - tsmb < 8)
 524                     break;
 525                 tsmb_size = bytestream_get_be64(&tsmb);
 526                 size_var = 16;
 527             } else
 528                 size_var = 8;
 529             //size_var is equal to 8 or 16 depending on the size of box
 530
 531             if (tsmb_size < size_var) {
 532                 av_log(avctx, AV_LOG_ERROR, "tsmb_size invalid\n");
 533                 return AVERROR_INVALIDDATA;
 534             }
 535             tsmb_size -= size_var;
 536
 537             if (tsmb_end - tsmb < tsmb_size)
 538                 break;
 539
 540             for (i = 0; i < box_count; i++) {
 541                 if (tsmb_type == box_types[i].type) {
 542                     if (tsmb_size < box_types[i].base_size)
 543                         break;
 544                     ret_tsmb = box_types[i].decode(tsmb, m, tsmb_size);
 545                     if (ret_tsmb == -1)
 546                         break;
 547                 }
 548             }
 549             tsmb += tsmb_size;
 550         }
 551         text_to_ass(&buf, ptr, end, avctx);
 552         mov_text_cleanup(m);
 553     } else
 554         text_to_ass(&buf, ptr, end, avctx);
 555
 556     ret = ff_ass_add_rect(sub, buf.str, m->readorder++, 0, NULL, NULL);
 557     av_bprint_finalize(&buf, NULL);
 558     if (ret < 0)
 559         return ret;
 560     *got_sub_ptr = sub->num_rects > 0;
 561     return avpkt->size;
 562 }
 563
 564 static int mov_text_decode_close(AVCodecContext *avctx)
 565 {
 566     MovTextContext *m = avctx->priv_data;
 567     mov_text_cleanup_ftab(m);
 568     mov_text_cleanup(m);
 569     return 0;
 570 }
 571
 572 static void mov_text_flush(AVCodecContext *avctx)
 573 {
 574     MovTextContext *m = avctx->priv_data;
 575     if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
 576         m->readorder = 0;
 577 }
 578
 579 #define OFFSET(x) offsetof(MovTextContext, x)
 580 #define FLAGS AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_SUBTITLE_PARAM
 581 static const AVOption options[] = {
 582     { "width", "Frame width, usually video width", OFFSET(frame_width), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS },
 583     { "height", "Frame height, usually video height", OFFSET(frame_height), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS },
 584     { NULL },
 585 };
 586
/* AVClass of the decoder's private context; ties the option table above to
 * MovTextContext (referenced from ff_movtext_decoder.p.priv_class). */
static const AVClass mov_text_decoder_class = {
    .class_name = "MOV text decoder",
    .item_name  = av_default_item_name,
    .option     = options,
    .version    = LIBAVUTIL_VERSION_INT,
};
 593
/* Codec registration entry for the "mov_text" subtitle decoder: wires the
 * init, decode, close and flush callbacks defined in this file and sizes
 * the private data as one MovTextContext. */
const FFCodec ff_movtext_decoder = {
    .p.name       = "mov_text",
    CODEC_LONG_NAME("3GPP Timed Text subtitle"),
    .p.type       = AVMEDIA_TYPE_SUBTITLE,
    .p.id         = AV_CODEC_ID_MOV_TEXT,
    .priv_data_size = sizeof(MovTextContext),
    .p.priv_class = &mov_text_decoder_class,
    .init         = mov_text_init,
    FF_CODEC_DECODE_SUB_CB(mov_text_decode_frame),
    .close        = mov_text_decode_close,
    .flush        = mov_text_flush,
};