2 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
3 to be able to assemble a single stand-alone subtitle file, suitably adjusting
4 timestamps on the way, while everything else is passed through unmodified.
6 Regular expressions based on the W3C WebVTT specification
7 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
8 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
14 from .utils
import int_or_none
, timetuple_from_msec
19 An object that maintains the current parsing position and allows
20 conveniently advancing it as syntax elements are successfully parsed.
23 def __init__(self
, string
):
28 if isinstance(r
, re
.Pattern
):
29 return r
.match(self
._data
, self
._pos
)
30 if isinstance(r
, str):
31 if self
._data
.startswith(r
, self
._pos
):
36 def advance(self
, by
):
39 elif isinstance(by
, re
.Match
):
40 amt
= len(by
.group(0))
41 elif isinstance(by
, str):
43 elif isinstance(by
, int):
51 return self
.advance(self
.match(r
))
54 return _MatchChildParser(self
)
57 class _MatchChildParser(_MatchParser
):
59 A child parser state, which advances through the same data as
60 its parent, but has an independent position. This is useful when
61 advancing through syntax elements we might later want to backtrack
65 def __init__(self
, parent
):
66 super().__init
__(parent
._data
)
67 self
.__parent
= parent
68 self
._pos
= parent
._pos
72 Advance the parent state to the current position of this child state.
74 self
.__parent
._pos
= self
._pos
class ParseError(Exception):
    """Error raised when the input cannot be parsed at the current position."""

    def __init__(self, parser):
        """
        Build the error message from the state of @parser.

        The message reports the parser's current position together with
        an excerpt of at most 100 characters of the data that starts at
        that position.
        """
        text = parser._data
        offset = parser._pos
        excerpt = text[offset:offset + 100]
        message = f'Parse error at position {offset} (near {excerpt!r})'
        super().__init__(message)
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
88 _REGEX_TS
= re
.compile(r
'''(?x)
# Matches (with zero width) only at the very end of the input.
_REGEX_EOF = re.compile(r'\Z')
# A single line terminator (CRLF, CR or LF). The `$` alternative additionally
# allows a zero-width match at the end of the input.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')
# One or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
# A possibly empty run of spaces and tabs.
_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')
102 Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
103 into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
106 int(part
or 0) * mult
for part
, mult
in zip(ts
.groups(), (3600_000, 60_000, 1000, 1)))
111 Convert an MPEG PES timestamp into a WebVTT timestamp.
112 This will lose sub-millisecond precision.
114 return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts
+ 45) // 90))
119 An abstract WebVTT block.
122 def __init__(self
, **kwargs
):
123 for key
, val
in kwargs
.items():
124 setattr(self
, key
, val
)
127 def parse(cls
, parser
):
128 m
= parser
.match(cls
._REGEX
)
132 return cls(raw
=m
.group(0))
134 def write_into(self
, stream
):
135 stream
.write(self
.raw
)
138 class HeaderBlock(Block
):
140 A WebVTT block that may only appear in the header part of the file,
141 i.e. before any cue blocks.
146 class Magic(HeaderBlock
):
147 _REGEX
= re
.compile(r
'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
149 # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
150 # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
151 # doesn't specify the exact grammar nor where in the WebVTT
152 # syntax it should be placed; the below has been devised based
153 # on usage in the wild
155 # And strictly speaking, the presence of this extension violates
156 # the W3C WebVTT spec. Oh well.
158 _REGEX_TSMAP
= re
.compile(r
'X-TIMESTAMP-MAP=')
159 _REGEX_TSMAP_LOCAL
= re
.compile(r
'LOCAL:')
160 _REGEX_TSMAP_MPEGTS
= re
.compile(r
'MPEGTS:([0-9]+)')
161 _REGEX_TSMAP_SEP
= re
.compile(r
'[ \t]*,[ \t]*')
163 # This was removed from the spec in the 2017 revision;
164 # the last spec draft to describe this syntax element is
165 # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
166 # Nevertheless, YouTube keeps serving those
167 _REGEX_META
= re
.compile(r
'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')
170 def __parse_tsmap(cls
, parser
):
171 parser
= parser
.child()
174 m
= parser
.consume(cls
._REGEX
_TSMAP
_LOCAL
)
176 m
= parser
.consume(_REGEX_TS
)
178 raise ParseError(parser
)
181 raise ParseError(parser
)
183 m
= parser
.consume(cls
._REGEX
_TSMAP
_MPEGTS
)
185 mpegts
= int_or_none(m
.group(1))
187 raise ParseError(parser
)
189 raise ParseError(parser
)
190 if parser
.consume(cls
._REGEX
_TSMAP
_SEP
):
192 if parser
.consume(_REGEX_NL
):
194 raise ParseError(parser
)
200 def parse(cls
, parser
):
201 parser
= parser
.child()
203 m
= parser
.consume(cls
._REGEX
)
205 raise ParseError(parser
)
208 local
, mpegts
, meta
= None, None, ''
209 while not parser
.consume(_REGEX_NL
):
210 if parser
.consume(cls
._REGEX
_TSMAP
):
211 local
, mpegts
= cls
.__parse
_tsmap
(parser
)
213 m
= parser
.consume(cls
._REGEX
_META
)
217 raise ParseError(parser
)
219 return cls(extra
=extra
, mpegts
=mpegts
, local
=local
, meta
=meta
)
221 def write_into(self
, stream
):
222 stream
.write('WEBVTT')
223 if self
.extra
is not None:
224 stream
.write(self
.extra
)
226 if self
.local
or self
.mpegts
:
227 stream
.write('X-TIMESTAMP-MAP=LOCAL:')
228 stream
.write(_format_ts(self
.local
if self
.local
is not None else 0))
229 stream
.write(',MPEGTS:')
230 stream
.write(str(self
.mpegts
if self
.mpegts
is not None else 0))
233 stream
.write(self
.meta
)
237 class StyleBlock(HeaderBlock
):
238 _REGEX
= re
.compile(r
'''(?x)
239 STYLE[\ \t]*(?:\r\n|[\r\n])
240 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
245 class RegionBlock(HeaderBlock
):
246 _REGEX
= re
.compile(r
'''(?x)
248 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
253 class CommentBlock(Block
):
254 _REGEX
= re
.compile(r
'''(?x)
255 NOTE(?:\r\n|[\ \t\r\n])
256 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
261 class CueBlock(Block
):
263 A cue block. The payload is not interpreted.
266 _REGEX_ID
= re
.compile(r
'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
267 _REGEX_ARROW
= re
.compile(r
'[ \t]+-->[ \t]+')
268 _REGEX_SETTINGS
= re
.compile(r
'[ \t]+((?:(?!-->)[^\r\n])+)')
269 _REGEX_PAYLOAD
= re
.compile(r
'[^\r\n]+(?:\r\n|[\r\n])?')
272 def parse(cls
, parser
):
273 parser
= parser
.child()
276 m
= parser
.consume(cls
._REGEX
_ID
)
280 m0
= parser
.consume(_REGEX_TS
)
283 if not parser
.consume(cls
._REGEX
_ARROW
):
285 m1
= parser
.consume(_REGEX_TS
)
288 m2
= parser
.consume(cls
._REGEX
_SETTINGS
)
289 parser
.consume(_REGEX_OPTIONAL_WHITESPACE
)
290 if not parser
.consume(_REGEX_NL
):
293 start
= _parse_ts(m0
)
295 settings
= m2
.group(1) if m2
is not None else None
299 m
= parser
.consume(cls
._REGEX
_PAYLOAD
)
302 text
.write(m
.group(0))
307 start
=start
, end
=end
, settings
=settings
,
308 text
=text
.getvalue(),
311 def write_into(self
, stream
):
312 if self
.id is not None:
313 stream
.write(self
.id)
315 stream
.write(_format_ts(self
.start
))
316 stream
.write(' --> ')
317 stream
.write(_format_ts(self
.end
))
318 if self
.settings
is not None:
320 stream
.write(self
.settings
)
322 stream
.write(self
.text
)
332 'settings': self
.settings
,
335 def __eq__(self
, other
):
336 return self
.as_json
== other
.as_json
339 def from_json(cls
, json
):
345 settings
=json
['settings'],
348 def hinges(self
, other
):
349 if self
.text
!= other
.text
:
351 if self
.settings
!= other
.settings
:
353 return self
.start
<= self
.end
== other
.start
<= other
.end
356 def parse_fragment(frag_content
):
358 A generator that yields (partially) parsed WebVTT blocks when given
359 a bytes object containing the raw contents of a WebVTT file.
362 parser
= _MatchParser(frag_content
.decode())
364 yield Magic
.parse(parser
)
366 while not parser
.match(_REGEX_EOF
):
367 if parser
.consume(_REGEX_BLANK
):
370 block
= RegionBlock
.parse(parser
)
374 block
= StyleBlock
.parse(parser
)
378 block
= CommentBlock
.parse(parser
)
380 yield block
# XXX: or skip
385 while not parser
.match(_REGEX_EOF
):
386 if parser
.consume(_REGEX_BLANK
):
389 block
= CommentBlock
.parse(parser
)
391 yield block
# XXX: or skip
393 block
= CueBlock
.parse(parser
)
398 raise ParseError(parser
)