[ie/dailymotion] Improve `_VALID_URL` (#7692)
[yt-dlp3.git] / yt_dlp / webvtt.py
blobdd72982778c275688398644794f303f538ea332a
1 """
2 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
3 to be able to assemble a single stand-alone subtitle file, suitably adjusting
4 timestamps on the way, while everything else is passed through unmodified.
6 Regular expressions based on the W3C WebVTT specification
7 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
8 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
9 """
11 import io
12 import re
14 from .utils import int_or_none, timetuple_from_msec
class _MatchParser:
    """
    Keeps a cursor into a string and moves it forward as syntax
    elements are recognised at the cursor.
    """

    def __init__(self, string):
        self._data = string
        self._pos = 0

    def match(self, r):
        """
        Test *r* against the data at the current position without moving.

        For a compiled pattern the result is the re.Match (or None).
        For a plain string the result is its length when the data
        continues with that string, otherwise None.
        Any other argument type raises ValueError.
        """
        if isinstance(r, re.Pattern):
            return r.match(self._data, self._pos)
        if not isinstance(r, str):
            raise ValueError(r)
        return len(r) if self._data.startswith(r, self._pos) else None

    def advance(self, by):
        """
        Move the cursor forward and return *by* unchanged.

        *by* may be None (no movement), a re.Match or str (move by the
        length of the matched text / the string), or an int (move by that
        many characters). Any other type raises ValueError.
        """
        if by is None:
            step = 0
        elif isinstance(by, re.Match):
            step = by.end() - by.start()
        elif isinstance(by, str):
            step = len(by)
        elif isinstance(by, int):
            step = by
        else:
            raise ValueError(by)
        self._pos += step
        return by

    def consume(self, r):
        """
        Match *r* at the cursor and, on success, step over it.
        Returns whatever match() returned.
        """
        found = self.match(r)
        return self.advance(found)

    def child(self):
        """
        Create a child parser over the same data, starting at the
        current position.
        """
        return _MatchChildParser(self)
class _MatchChildParser(_MatchParser):
    """
    A parser state that shares its parent's data but tracks its own
    position. Use it to parse speculatively: the parent only moves
    when commit() is called, so abandoning the child backtracks.
    """

    def __init__(self, parent):
        super().__init__(parent._data)
        self.__parent = parent
        # Start where the parent currently is
        self._pos = parent._pos

    def commit(self):
        """
        Move the parent state up to this child's current position
        and return the parent.
        """
        owner = self.__parent
        owner._pos = self._pos
        return owner
class ParseError(Exception):
    """
    Raised when the data cannot be parsed at the parser's current position.

    The message reports that position and up to 20 characters of the
    data that follow it.
    """

    def __init__(self, parser):
        pos = parser._pos
        nearby = parser._data[pos:pos + 20]
        super().__init__("Parse error at position %u (near %r)" % (pos, nearby))
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
# A timestamp. Captures, in order: hours (optional, one or more digits),
# minutes, seconds and milliseconds (the millisecond group is optional).
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# End of input
_REGEX_EOF = re.compile(r'\Z')
# A single line terminator, or the end of the data
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')
# One or more consecutive line terminators
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
def _parse_ts(ts):
    """
    Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
    into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
    """
    # Each captured group is weighted by its value in milliseconds
    # (hours, minutes, seconds, milliseconds); a group that did not
    # participate in the match counts as 0. 90 ticks make up one millisecond.
    return 90 * sum(
        int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1)))
def _format_ts(ts):
    """
    Convert an MPEG PES timestamp into a WebVTT timestamp.
    This will lose sub-millisecond precision.
    """
    # 90 ticks per millisecond: adding 45 before the floor division
    # rounds to the nearest millisecond instead of truncating.
    # NOTE(review): timetuple_from_msec presumably returns
    # (hours, minutes, seconds, milliseconds) -- confirm in .utils
    return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90))
class Block:
    """
    An abstract WebVTT block.

    Subclasses are expected to provide a compiled pattern as the class
    attribute _REGEX (used by parse()) or to override parse() themselves.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an instance attribute
        for key, val in kwargs.items():
            setattr(self, key, val)

    @classmethod
    def parse(cls, parser):
        """
        Try to parse a block of this type at the parser's position.

        Returns a new instance holding the matched text in its `raw`
        attribute and advances the parser past it, or returns None
        (leaving the parser untouched) when cls._REGEX does not match.
        """
        m = parser.match(cls._REGEX)
        if not m:
            return None
        parser.advance(m)
        return cls(raw=m.group(0))

    def write_into(self, stream):
        """
        Write the block's raw text, unmodified, into *stream*.
        """
        stream.write(self.raw)
class HeaderBlock(Block):
    """
    A WebVTT block that may only appear in the header part of the file,
    i.e. before any cue blocks.
    """
    pass
class Magic(HeaderBlock):
    """
    The file header: the "WEBVTT" magic line plus any header lines that
    directly follow it (X-TIMESTAMP-MAP and "key: value" metadata lines),
    up to the first blank line or the end of the data.

    Attributes set by parse():
      extra   text following "WEBVTT" on the magic line (or None)
      local   LOCAL value of X-TIMESTAMP-MAP in 90 kHz ticks (or None)
      mpegts  MPEGTS value of X-TIMESTAMP-MAP as an int (or None)
      meta    concatenated raw metadata header lines (possibly '')
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn't specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    # This was removed from the spec in the 2017 revision;
    # the last spec draft to describe this syntax element is
    # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
    # Nevertheless, YouTube keeps serving those
    _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the value part of an X-TIMESTAMP-MAP line (everything after
        the '=') up to and including the line terminator.

        Returns a (local, mpegts) tuple; a component that does not appear
        on the line is None. Raises ParseError on malformed input.
        """
        parser = parser.child()

        # Either component may be missing from the line; start from None so
        # that returning them never hits an unbound local.
        local, mpegts = None, None
        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if not m:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)
            # A separator announces another component; a line terminator
            # (or the end of the data) finishes the map.
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the header at the parser's position and return a Magic
        instance. Unlike Block.parse(), a header that cannot be parsed
        raises ParseError instead of returning None.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts, meta = None, None, ''
        # Header lines continue until a blank line or the end of the data
        while not parser.consume(_REGEX_NL):
            if parser.consume(cls._REGEX_TSMAP):
                local, mpegts = cls.__parse_tsmap(parser)
                continue
            m = parser.consume(cls._REGEX_META)
            if m:
                meta += m.group(0)
                continue
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)

    def write_into(self, stream):
        """
        Write the header into *stream*: the magic line, an X-TIMESTAMP-MAP
        line when either `local` or `mpegts` is set to a non-zero value
        (a missing component is written as 0), the metadata lines, and a
        terminating blank line.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        if self.meta:
            stream.write(self.meta)
        stream.write('\n')
class StyleBlock(HeaderBlock):
    """
    A STYLE header block: the word STYLE (optionally followed by spaces
    or tabs) on its own line, any number of non-empty lines that do not
    contain an arrow ("-->"), and a terminating blank line.
    """

    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
class RegionBlock(HeaderBlock):
    """
    A REGION header block: the word REGION (optionally followed by spaces
    or tabs), any number of non-empty lines that do not contain an arrow
    ("-->"), and a terminating line terminator.
    """

    # The line terminator after "REGION" is what the W3C grammar calls for;
    # without it in the pattern, a block whose settings start on the line
    # after "REGION" would only match up to the first line terminator and
    # leave its settings lines behind. It is kept optional so that input
    # with settings on the same line as "REGION" still matches.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
class CommentBlock(Block):
    """
    A NOTE block: the word NOTE followed by a space, tab or line
    terminator, any number of non-empty lines that do not contain an
    arrow ("-->"), and a terminating line terminator.
    """

    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Attributes:
      id        cue identifier line (or None)
      start     start time in 90 kHz ticks
      end       end time in 90 kHz ticks
      settings  cue settings following the timings (or None)
      text      raw payload lines, concatenated
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to parse a cue at the parser's position.

        Returns a CueBlock and advances the parser past the cue, or
        returns None (leaving the parser untouched) if the data at this
        position is not a cue.
        """
        parser = parser.child()

        # Optional identifier line preceding the timings
        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload is every following non-empty line
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=text.getvalue()
        )

    def write_into(self, stream):
        """
        Write the cue into *stream*: the identifier line (if any), the
        timings line with optional settings, the payload and a final
        line feed.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue's attributes as a plain dict (the inverse of from_json()).
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        # Two cues are equal when all of their attributes are. Objects that
        # do not expose `as_json` are not comparable; returning
        # NotImplemented lets Python fall back to its default handling
        # rather than failing with AttributeError.
        try:
            other_json = other.as_json
        except AttributeError:
            return NotImplemented
        return self.as_json == other_json

    @classmethod
    def from_json(cls, json):
        """
        Build a cue from a dict shaped like the one as_json produces.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Tell whether *other* has the same text and settings as this cue
        and starts exactly where this cue ends (with neither cue having
        its end before its start).
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end
def _parse_first_of(parser, block_types):
    """
    Try each block type in order; return the first block that parses
    at the parser's position, or None if none of them does.
    """
    for block_type in block_types:
        block = block_type.parse(parser)
        if block:
            return block
    return None


def parse_fragment(frag_content):
    """
    A generator that yields (partially) parsed WebVTT blocks when given
    a bytes object containing the raw contents of a WebVTT file.

    The first block yielded is always the Magic header. Raises ParseError
    if the header is malformed or if data is encountered that is neither
    a comment nor a cue once the header part is over.
    """

    parser = _MatchParser(frag_content.decode())

    yield Magic.parse(parser)

    # Header part: region, style and comment blocks, separated by blank
    # lines. The first thing that is none of those ends the header part.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        # XXX: comment blocks could be skipped instead of yielded
        block = _parse_first_of(parser, (RegionBlock, StyleBlock, CommentBlock))
        if not block:
            break
        yield block

    # Cue part: only comment and cue blocks are acceptable from here on.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        # XXX: comment blocks could be skipped instead of yielded
        block = _parse_first_of(parser, (CommentBlock, CueBlock))
        if not block:
            raise ParseError(parser)
        yield block