[ie/mlbtv] Fix extractor (#10515)
[yt-dlp.git] / yt_dlp / postprocessor / metadataparser.py
blob1d6054294eaaa4d80374950e099c83f191631768
1 import re
3 from .common import PostProcessor
4 from ..utils import Namespace, filter_dict, function_with_repr
class MetadataParserPP(PostProcessor):
    """Post-processor that rewrites fields of the info dict.

    It is configured with a list of actions. Each action is a sequence whose
    first item is a member of ``Actions`` and whose remaining items are the
    arguments of that action:

        (Actions.INTERPRET, from, to)
        (Actions.REPLACE, field, search, replace)
    """

    def __init__(self, downloader, actions):
        """Build the callable for every entry of ``actions`` up front.

        Raises ValueError if the first item of an entry is not in ``Actions``.
        Errors raised while building an action (e.g. an invalid regex)
        propagate from here as well.
        """
        super().__init__(downloader)
        self._actions = []
        for f in actions:
            action, *args = f
            # Raise explicitly instead of asserting: asserts are removed
            # under ``python -O``, and validate_action() raises ValueError too
            if action not in self.Actions:
                raise ValueError(f'{action!r} is not a valid action')
            self._actions.append(action(self, *args))

    @classmethod
    def validate_action(cls, action, *data):
        """Each action can be:
                (Actions.INTERPRET, from, to) OR
                (Actions.REPLACE, field, search, replace)
        """
        if action not in cls.Actions:
            raise ValueError(f'{action!r} is not a valid action')
        action(cls, *data)  # So this can raise error to validate

    @staticmethod
    def field_to_template(tmpl):
        """Return ``tmpl`` in output-template form.

        A bare field name (ASCII letters and underscores only) is wrapped as
        ``%(name)s``. Anything else is checked with
        ``YoutubeDL.validate_outtmpl``: the error it returns, if any, is
        raised; otherwise ``tmpl`` is returned unchanged.
        """
        # fullmatch rather than match + '$': '$' also matches just before a
        # trailing newline, which would let 'title\n' through as a field name
        if re.fullmatch(r'[a-zA-Z_]+', tmpl):
            return f'%({tmpl})s'

        # NOTE(review): imported locally, presumably to avoid a circular import
        from ..YoutubeDL import YoutubeDL
        err = YoutubeDL.validate_outtmpl(tmpl)
        if err:
            raise err
        return tmpl

    @staticmethod
    def format_to_regex(fmt):
        r"""
        Converts a string like
           '%(title)s - %(artist)s'
        to a regex like
           '(?P<title>.+)\ \-\ (?P<artist>.+)'

        A string without any '%(name)s' placeholder is returned unchanged,
        i.e. it is assumed to be a regex already.
        """
        if not re.search(r'%\(\w+\)s', fmt):
            return fmt
        lastpos = 0
        parts = []
        # replace %(..)s with regex group and escape other string parts
        for match in re.finditer(r'%\((\w+)\)s', fmt):
            parts.append(re.escape(fmt[lastpos:match.start()]))
            parts.append(rf'(?P<{match.group(1)}>.+)')
            lastpos = match.end()
        # Literal text after the last placeholder (may be empty)
        parts.append(re.escape(fmt[lastpos:]))
        return ''.join(parts)

    def run(self, info):
        """Apply every configured action to ``info`` in order.

        Returns a pair of an empty list and the (mutated) ``info``.
        """
        for f in self._actions:
            f(info)
        return [], info

    @function_with_repr
    def interpretter(self, inp, out):
        """Build the INTERPRET action.

        ``inp`` is a field name or template (see field_to_template) and
        ``out`` is a format or regex (see format_to_regex). Both are converted
        once, when the action is built, so invalid input raises here.

        The returned callable evaluates the template against ``info``,
        searches the result with the regex and copies the named groups kept by
        ``filter_dict`` into ``info``.
        """
        template = self.field_to_template(inp)
        out_re = re.compile(self.format_to_regex(out))

        def f(info):
            data_to_parse = self._downloader.evaluate_outtmpl(template, info)
            self.write_debug(f'Searching for {out_re.pattern!r} in {template!r}')
            match = out_re.search(data_to_parse)
            if match is None:
                self.to_screen(f'Could not interpret {inp!r} as {out!r}')
                return
            # presumably filter_dict drops groups that did not participate
            # in the match (value None) - confirm against ..utils
            for attribute, value in filter_dict(match.groupdict()).items():
                info[attribute] = value
                self.to_screen(f'Parsed {attribute} from {template!r}: {value!r}')

        return f

    @function_with_repr
    def replacer(self, field, search, replace):
        """Build the REPLACE action.

        ``search`` is compiled once, when the action is built. The returned
        callable substitutes every match of it in ``info[field]`` with
        ``replace``. A missing field or a non-string value is reported and
        ``info`` is left untouched.
        """
        search_re = re.compile(search)

        def f(info):
            val = info.get(field)
            if val is None:
                self.to_screen(f'Video does not have a {field}')
                return
            elif not isinstance(val, str):
                self.report_warning(f'Cannot replace in field {field} since it is a {type(val).__name__}')
                return
            self.write_debug(f'Replacing all {search!r} in {field} with {replace!r}')
            info[field], n = search_re.subn(replace, val)
            if n:
                self.to_screen(f'Changed {field} to: {info[field]}')
            else:
                self.to_screen(f'Did not find {search!r} in {field}')

        return f

    # The supported actions; members are the builder functions defined above
    Actions = Namespace(INTERPRET=interpretter, REPLACE=replacer)
class MetadataFromFieldPP(MetadataParserPP):
    """MetadataParserPP configured from a list of 'FROM:TO' strings."""

    @classmethod
    def to_action(cls, f):
        r"""Convert a 'FROM:TO' string into an INTERPRET action tuple.

        The string is split at the first ':' that is not preceded by a
        backslash. Every '\:' in the FROM part is then unescaped to ':';
        the TO part is used as is.

        Raises ValueError if ``f`` has no such ':' followed by at least one
        character.
        """
        match = re.match(r'(?s)(?P<in>.*?)(?<!\\):(?P<out>.+)$', f)
        if match is None:
            raise ValueError(f'it should be FROM:TO, not {f!r}')
        return (
            cls.Actions.INTERPRET,
            match.group('in').replace('\\:', ':'),
            match.group('out'),
        )

    def __init__(self, downloader, formats):
        """Create one INTERPRET action for each string in ``formats``."""
        super().__init__(downloader, [self.to_action(f) for f in formats])
# Deprecated
class MetadataFromTitlePP(MetadataParserPP):
    """Deprecated: interprets the 'title' field using a single format.

    Constructing an instance emits a deprecation warning.
    """

    def __init__(self, downloader, titleformat):
        title_action = (self.Actions.INTERPRET, 'title', titleformat)
        super().__init__(downloader, [title_action])

        message = (
            'yt_dlp.postprocessor.MetadataFromTitlePP is deprecated '
            'and may be removed in a future version. Use yt_dlp.postprocessor.MetadataFromFieldPP instead')
        self.deprecation_warning(message)