yt_dlp/webvtt.py

   1 """
   2 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
   3 to be able to assemble a single stand-alone subtitle file, suitably adjusting
   4 timestamps on the way, while everything else is passed through unmodified.
   5
   6 Regular expressions based on the W3C WebVTT specification
   7 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
   8 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
   9 """
  10
  11 import io
  12 import re
  13
  14 from .utils import int_or_none, timetuple_from_msec
  15
  16
  17 class _MatchParser:
  18     """
  19     An object that maintains the current parsing position and allows
  20     conveniently advancing it as syntax elements are successfully parsed.
  21     """
  22
  23     def __init__(self, string):
  24         self._data = string
  25         self._pos = 0
  26
  27     def match(self, r):
  28         if isinstance(r, re.Pattern):
  29             return r.match(self._data, self._pos)
  30         if isinstance(r, str):
  31             if self._data.startswith(r, self._pos):
  32                 return len(r)
  33             return None
  34         raise ValueError(r)
  35
  36     def advance(self, by):
  37         if by is None:
  38             amt = 0
  39         elif isinstance(by, re.Match):
  40             amt = len(by.group(0))
  41         elif isinstance(by, str):
  42             amt = len(by)
  43         elif isinstance(by, int):
  44             amt = by
  45         else:
  46             raise ValueError(by)
  47         self._pos += amt
  48         return by
  49
  50     def consume(self, r):
  51         return self.advance(self.match(r))
  52
  53     def child(self):
  54         return _MatchChildParser(self)
  55
  56
  57 class _MatchChildParser(_MatchParser):
  58     """
  59     A child parser state, which advances through the same data as
  60     its parent, but has an independent position. This is useful when
  61     advancing through syntax elements we might later want to backtrack
  62     from.
  63     """
  64
  65     def __init__(self, parent):
  66         super().__init__(parent._data)
  67         self.__parent = parent
  68         self._pos = parent._pos
  69
  70     def commit(self):
  71         """
  72         Advance the parent state to the current position of this child state.
  73         """
  74         self.__parent._pos = self._pos
  75         return self.__parent
  76
  77
  78 class ParseError(Exception):
  79     def __init__(self, parser):
  80         data = parser._data[parser._pos:parser._pos + 100]
  81         super().__init__(f'Parse error at position {parser._pos} (near {data!r})')
  82
  83
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Groups: (1) hours, optional; (2) minutes; (3) seconds; (4) milliseconds,
# optional. Note that the '.' after the seconds is required by this pattern
# even though the three millisecond digits following it are not.
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches (as an empty match) only at the very end of the data
_REGEX_EOF = re.compile(r'\Z')
# A single line terminator (CRLF, CR or LF), or an empty match at the end of the data
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')
# A run of one or more line terminators
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
# Zero or more spaces and tabs; always matches, possibly as an empty match
_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')
  98
  99
 100 def _parse_ts(ts):
 101     """
 102     Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
 103     into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
 104     """
 105     return 90 * sum(
 106         int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1)))
 107
 108
 109 def _format_ts(ts):
 110     """
 111     Convert an MPEG PES timestamp into a WebVTT timestamp.
 112     This will lose sub-millisecond precision.
 113     """
 114     return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90))
 115
 116
 117 class Block:
 118     """
 119     An abstract WebVTT block.
 120     """
 121
 122     def __init__(self, **kwargs):
 123         for key, val in kwargs.items():
 124             setattr(self, key, val)
 125
 126     @classmethod
 127     def parse(cls, parser):
 128         m = parser.match(cls._REGEX)
 129         if not m:
 130             return None
 131         parser.advance(m)
 132         return cls(raw=m.group(0))
 133
 134     def write_into(self, stream):
 135         stream.write(self.raw)
 136
 137
 138 class HeaderBlock(Block):
 139     """
 140     A WebVTT block that may only appear in the header part of the file,
 141     i.e. before any cue blocks.
 142     """
 143     pass
 144
 145
 146 class Magic(HeaderBlock):
 147     _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
 148
 149     # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
 150     # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
 151     # doesn't specify the exact grammar nor where in the WebVTT
 152     # syntax it should be placed; the below has been devised based
 153     # on usage in the wild
 154     #
 155     # And strictly speaking, the presence of this extension violates
 156     # the W3C WebVTT spec. Oh well.
 157
 158     _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
 159     _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
 160     _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
 161     _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')
 162
 163     # This was removed from the spec in the 2017 revision;
 164     # the last spec draft to describe this syntax element is
 165     # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
 166     # Nevertheless, YouTube keeps serving those
 167     _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')
 168
 169     @classmethod
 170     def __parse_tsmap(cls, parser):
 171         parser = parser.child()
 172
 173         while True:
 174             m = parser.consume(cls._REGEX_TSMAP_LOCAL)
 175             if m:
 176                 m = parser.consume(_REGEX_TS)
 177                 if m is None:
 178                     raise ParseError(parser)
 179                 local = _parse_ts(m)
 180                 if local is None:
 181                     raise ParseError(parser)
 182             else:
 183                 m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
 184                 if m:
 185                     mpegts = int_or_none(m.group(1))
 186                     if mpegts is None:
 187                         raise ParseError(parser)
 188                 else:
 189                     raise ParseError(parser)
 190             if parser.consume(cls._REGEX_TSMAP_SEP):
 191                 continue
 192             if parser.consume(_REGEX_NL):
 193                 break
 194             raise ParseError(parser)
 195
 196         parser.commit()
 197         return local, mpegts
 198
 199     @classmethod
 200     def parse(cls, parser):
 201         parser = parser.child()
 202
 203         m = parser.consume(cls._REGEX)
 204         if not m:
 205             raise ParseError(parser)
 206
 207         extra = m.group(1)
 208         local, mpegts, meta = None, None, ''
 209         while not parser.consume(_REGEX_NL):
 210             if parser.consume(cls._REGEX_TSMAP):
 211                 local, mpegts = cls.__parse_tsmap(parser)
 212                 continue
 213             m = parser.consume(cls._REGEX_META)
 214             if m:
 215                 meta += m.group(0)
 216                 continue
 217             raise ParseError(parser)
 218         parser.commit()
 219         return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)
 220
 221     def write_into(self, stream):
 222         stream.write('WEBVTT')
 223         if self.extra is not None:
 224             stream.write(self.extra)
 225         stream.write('\n')
 226         if self.local or self.mpegts:
 227             stream.write('X-TIMESTAMP-MAP=LOCAL:')
 228             stream.write(_format_ts(self.local if self.local is not None else 0))
 229             stream.write(',MPEGTS:')
 230             stream.write(str(self.mpegts if self.mpegts is not None else 0))
 231             stream.write('\n')
 232         if self.meta:
 233             stream.write(self.meta)
 234         stream.write('\n')
 235
 236
class StyleBlock(HeaderBlock):
    """
    A STYLE block: a line consisting of `STYLE` (optionally followed by
    spaces or tabs), then any number of non-empty lines that do not
    contain `-->`, then a blank line. The content is not interpreted.
    """

    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
 243
 244
 245 class RegionBlock(HeaderBlock):
 246     _REGEX = re.compile(r'''(?x)
 247         REGION[\ \t]*
 248         ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
 249         (?:\r\n|[\r\n])
 250     ''')
 251
 252
class CommentBlock(Block):
    """
    A comment block: `NOTE` followed by a space, a tab or a line
    terminator, then any number of non-empty lines that do not contain
    `-->`, then a blank line. The content is not interpreted.
    """

    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
 259
 260
 261 class CueBlock(Block):
 262     """
 263     A cue block. The payload is not interpreted.
 264     """
 265
 266     _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
 267     _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
 268     _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
 269     _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')
 270
 271     @classmethod
 272     def parse(cls, parser):
 273         parser = parser.child()
 274
 275         id_ = None
 276         m = parser.consume(cls._REGEX_ID)
 277         if m:
 278             id_ = m.group(1)
 279
 280         m0 = parser.consume(_REGEX_TS)
 281         if not m0:
 282             return None
 283         if not parser.consume(cls._REGEX_ARROW):
 284             return None
 285         m1 = parser.consume(_REGEX_TS)
 286         if not m1:
 287             return None
 288         m2 = parser.consume(cls._REGEX_SETTINGS)
 289         parser.consume(_REGEX_OPTIONAL_WHITESPACE)
 290         if not parser.consume(_REGEX_NL):
 291             return None
 292
 293         start = _parse_ts(m0)
 294         end = _parse_ts(m1)
 295         settings = m2.group(1) if m2 is not None else None
 296
 297         text = io.StringIO()
 298         while True:
 299             m = parser.consume(cls._REGEX_PAYLOAD)
 300             if not m:
 301                 break
 302             text.write(m.group(0))
 303
 304         parser.commit()
 305         return cls(
 306             id=id_,
 307             start=start, end=end, settings=settings,
 308             text=text.getvalue(),
 309         )
 310
 311     def write_into(self, stream):
 312         if self.id is not None:
 313             stream.write(self.id)
 314             stream.write('\n')
 315         stream.write(_format_ts(self.start))
 316         stream.write(' --> ')
 317         stream.write(_format_ts(self.end))
 318         if self.settings is not None:
 319             stream.write(' ')
 320             stream.write(self.settings)
 321         stream.write('\n')
 322         stream.write(self.text)
 323         stream.write('\n')
 324
 325     @property
 326     def as_json(self):
 327         return {
 328             'id': self.id,
 329             'start': self.start,
 330             'end': self.end,
 331             'text': self.text,
 332             'settings': self.settings,
 333         }
 334
 335     def __eq__(self, other):
 336         return self.as_json == other.as_json
 337
 338     @classmethod
 339     def from_json(cls, json):
 340         return cls(
 341             id=json['id'],
 342             start=json['start'],
 343             end=json['end'],
 344             text=json['text'],
 345             settings=json['settings'],
 346         )
 347
 348     def hinges(self, other):
 349         if self.text != other.text:
 350             return False
 351         if self.settings != other.settings:
 352             return False
 353         return self.start <= self.end == other.start <= other.end
 354
 355
 356 def parse_fragment(frag_content):
 357     """
 358     A generator that yields (partially) parsed WebVTT blocks when given
 359     a bytes object containing the raw contents of a WebVTT file.
 360     """
 361
 362     parser = _MatchParser(frag_content.decode())
 363
 364     yield Magic.parse(parser)
 365
 366     while not parser.match(_REGEX_EOF):
 367         if parser.consume(_REGEX_BLANK):
 368             continue
 369
 370         block = RegionBlock.parse(parser)
 371         if block:
 372             yield block
 373             continue
 374         block = StyleBlock.parse(parser)
 375         if block:
 376             yield block
 377             continue
 378         block = CommentBlock.parse(parser)
 379         if block:
 380             yield block  # XXX: or skip
 381             continue
 382
 383         break
 384
 385     while not parser.match(_REGEX_EOF):
 386         if parser.consume(_REGEX_BLANK):
 387             continue
 388
 389         block = CommentBlock.parse(parser)
 390         if block:
 391             yield block  # XXX: or skip
 392             continue
 393         block = CueBlock.parse(parser)
 394         if block:
 395             yield block
 396             continue
 397
 398         raise ParseError(parser)