2 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
3 to be able to assemble a single stand-alone subtitle file, suitably adjusting
4 timestamps on the way, while everything else is passed through unmodified.
6 Regular expressions based on the W3C WebVTT specification
7 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
8 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
14 from .utils
import int_or_none
, timetuple_from_msec
19 An object that maintains the current parsing position and allows
20 conveniently advancing it as syntax elements are successfully parsed.
23 def __init__(self
, string
):
28 if isinstance(r
, re
.Pattern
):
29 return r
.match(self
._data
, self
._pos
)
30 if isinstance(r
, str):
31 if self
._data
.startswith(r
, self
._pos
):
36 def advance(self
, by
):
39 elif isinstance(by
, re
.Match
):
40 amt
= len(by
.group(0))
41 elif isinstance(by
, str):
43 elif isinstance(by
, int):
51 return self
.advance(self
.match(r
))
54 return _MatchChildParser(self
)
57 class _MatchChildParser(_MatchParser
):
59 A child parser state, which advances through the same data as
60 its parent, but has an independent position. This is useful when
61 advancing through syntax elements we might later want to backtrack
def __init__(self, parent):
    """
    Create a child parser state on top of ``parent``.

    The child reads the same ``_data`` as the parent and begins at the
    parent's current ``_pos``; the parent itself is not modified here.
    """
    # Let the base initializer set up the state over the parent's data.
    super().__init__(parent._data)
    # Start from the position the parent has currently reached.
    self._pos = parent._pos
    # Keep a handle on the parent so its position can be updated later.
    self.__parent = parent
72 Advance the parent state to the current position of this child state.
74 self
.__parent
._pos
= self
._pos
class ParseError(Exception):
    """Raised when the input cannot be parsed at the parser's current position."""

    def __init__(self, parser):
        """
        Build the error message from the state of ``parser``.

        ``parser`` must expose ``_pos`` (the current offset) and ``_data``
        (the text being parsed). The message reports the offset and quotes
        at most 100 characters of the data starting at that offset.
        """
        position = parser._pos
        # Only a short excerpt is quoted so the message stays readable.
        excerpt = parser._data[position:position + 100]
        super().__init__(
            "Parse error at position %u (near %r)" % (position, excerpt))
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
89 _REGEX_TS
= re
.compile(r
'''(?x)
# End of the input.
_REGEX_EOF = re.compile(r'\Z')

# A single line terminator (CRLF, CR or LF), or the end of the input.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')

# One or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')

# Any run (possibly empty) of spaces and tabs.
_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')
103 Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
104 into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
107 int(part
or 0) * mult
for part
, mult
in zip(ts
.groups(), (3600_000, 60_000, 1000, 1)))
112 Convert an MPEG PES timestamp into a WebVTT timestamp.
113 This will lose sub-millisecond precision.
115 return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts
+ 45) // 90))
120 An abstract WebVTT block.
def __init__(self, **kwargs):
    """Store every keyword argument on the instance as a same-named attribute."""
    for name in kwargs:
        setattr(self, name, kwargs[name])
128 def parse(cls
, parser
):
129 m
= parser
.match(cls
._REGEX
)
133 return cls(raw
=m
.group(0))
def write_into(self, stream):
    """Write this block's ``raw`` text to ``stream`` unchanged."""
    raw_text = self.raw
    stream.write(raw_text)
class HeaderBlock(Block):
    """
    Base class for WebVTT blocks that are restricted to the header part of
    the file, that is, blocks that must come before any cue block.
    """
147 class Magic(HeaderBlock
):
# The signature line: an optional BOM, the literal "WEBVTT", optional
# trailing text introduced by a space or tab (captured), and a line
# terminator.
_REGEX = re.compile(
    r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
150 # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
151 # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
152 # doesn’t specify the exact grammar nor where in the WebVTT
153 # syntax it should be placed; the below has been devised based
154 # on usage in the wild
156 # And strictly speaking, the presence of this extension violates
157 # the W3C WebVTT spec. Oh well.
# Pieces of an "X-TIMESTAMP-MAP=LOCAL:<timestamp>,MPEGTS:<ticks>" line.
# Introducer of the whole extension line.
_REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
# Key that precedes the local (WebVTT) timestamp.
_REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
# Key plus decimal value of the MPEG timestamp; the digits are captured.
_REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
# Comma between the two entries, with optional surrounding spaces/tabs.
_REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')
164 # This was removed from the spec in the 2017 revision;
165 # the last spec draft to describe this syntax element is
166 # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
167 # Nevertheless, YouTube keeps serving those
# A "key:value" metadata header line: neither side may be empty or contain
# "-->", and the line must end with a line terminator.
_REGEX_META = re.compile(
    r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')
171 def __parse_tsmap(cls
, parser
):
172 parser
= parser
.child()
175 m
= parser
.consume(cls
._REGEX
_TSMAP
_LOCAL
)
177 m
= parser
.consume(_REGEX_TS
)
179 raise ParseError(parser
)
182 raise ParseError(parser
)
184 m
= parser
.consume(cls
._REGEX
_TSMAP
_MPEGTS
)
186 mpegts
= int_or_none(m
.group(1))
188 raise ParseError(parser
)
190 raise ParseError(parser
)
191 if parser
.consume(cls
._REGEX
_TSMAP
_SEP
):
193 if parser
.consume(_REGEX_NL
):
195 raise ParseError(parser
)
201 def parse(cls
, parser
):
202 parser
= parser
.child()
204 m
= parser
.consume(cls
._REGEX
)
206 raise ParseError(parser
)
209 local
, mpegts
, meta
= None, None, ''
210 while not parser
.consume(_REGEX_NL
):
211 if parser
.consume(cls
._REGEX
_TSMAP
):
212 local
, mpegts
= cls
.__parse
_tsmap
(parser
)
214 m
= parser
.consume(cls
._REGEX
_META
)
218 raise ParseError(parser
)
220 return cls(extra
=extra
, mpegts
=mpegts
, local
=local
, meta
=meta
)
222 def write_into(self
, stream
):
223 stream
.write('WEBVTT')
224 if self
.extra
is not None:
225 stream
.write(self
.extra
)
227 if self
.local
or self
.mpegts
:
228 stream
.write('X-TIMESTAMP-MAP=LOCAL:')
229 stream
.write(_format_ts(self
.local
if self
.local
is not None else 0))
230 stream
.write(',MPEGTS:')
231 stream
.write(str(self
.mpegts
if self
.mpegts
is not None else 0))
234 stream
.write(self
.meta
)
238 class StyleBlock(HeaderBlock
):
239 _REGEX
= re
.compile(r
'''(?x)
240 STYLE[\ \t]*(?:\r\n|[\r\n])
241 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
246 class RegionBlock(HeaderBlock
):
247 _REGEX
= re
.compile(r
'''(?x)
249 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
254 class CommentBlock(Block
):
255 _REGEX
= re
.compile(r
'''(?x)
256 NOTE(?:\r\n|[\ \t\r\n])
257 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
262 class CueBlock(Block
):
264 A cue block. The payload is not interpreted.
# Cue identifier line: non-empty text without "-->" (captured), followed
# by a line terminator.
_REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
# The "-->" separator between the two timestamps, with mandatory
# spaces/tabs on both sides.
_REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
# Cue settings: leading spaces/tabs, then the rest of the line (captured)
# as long as it does not contain "-->".
_REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
# One non-empty payload line together with its optional line terminator.
_REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')
273 def parse(cls
, parser
):
274 parser
= parser
.child()
277 m
= parser
.consume(cls
._REGEX
_ID
)
281 m0
= parser
.consume(_REGEX_TS
)
284 if not parser
.consume(cls
._REGEX
_ARROW
):
286 m1
= parser
.consume(_REGEX_TS
)
289 m2
= parser
.consume(cls
._REGEX
_SETTINGS
)
290 parser
.consume(_REGEX_OPTIONAL_WHITESPACE
)
291 if not parser
.consume(_REGEX_NL
):
294 start
= _parse_ts(m0
)
296 settings
= m2
.group(1) if m2
is not None else None
300 m
= parser
.consume(cls
._REGEX
_PAYLOAD
)
303 text
.write(m
.group(0))
308 start
=start
, end
=end
, settings
=settings
,
312 def write_into(self
, stream
):
313 if self
.id is not None:
314 stream
.write(self
.id)
316 stream
.write(_format_ts(self
.start
))
317 stream
.write(' --> ')
318 stream
.write(_format_ts(self
.end
))
319 if self
.settings
is not None:
321 stream
.write(self
.settings
)
323 stream
.write(self
.text
)
333 'settings': self
.settings
,
def __eq__(self, other):
    """Compare two objects by the values of their ``as_json`` attributes."""
    mine = self.as_json
    theirs = other.as_json
    return mine == theirs
340 def from_json(cls
, json
):
346 settings
=json
['settings']
349 def hinges(self
, other
):
350 if self
.text
!= other
.text
:
352 if self
.settings
!= other
.settings
:
354 return self
.start
<= self
.end
== other
.start
<= other
.end
357 def parse_fragment(frag_content
):
359 A generator that yields (partially) parsed WebVTT blocks when given
360 a bytes object containing the raw contents of a WebVTT file.
363 parser
= _MatchParser(frag_content
.decode())
365 yield Magic
.parse(parser
)
367 while not parser
.match(_REGEX_EOF
):
368 if parser
.consume(_REGEX_BLANK
):
371 block
= RegionBlock
.parse(parser
)
375 block
= StyleBlock
.parse(parser
)
379 block
= CommentBlock
.parse(parser
)
381 yield block
# XXX: or skip
386 while not parser
.match(_REGEX_EOF
):
387 if parser
.consume(_REGEX_BLANK
):
390 block
= CommentBlock
.parse(parser
)
392 yield block
# XXX: or skip
394 block
= CueBlock
.parse(parser
)
399 raise ParseError(parser
)