- Got rid of newmodule.c
[python/dscho.git] / Lib / mimetypes.py
blobad13be374edeb75e4f29677bb6a961303e0d55d0
1 """Guess the MIME type of a file.
3 This module defines two useful functions:
5 guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
7 guess_extension(type, strict=1) -- guess the extension for a given MIME type.
9 It also contains the following, for tuning the behavior:
11 Data:
13 knownfiles -- list of files to parse
14 inited -- flag set when init() has been called
15 suffix_map -- dictionary mapping suffixes to suffixes
16 encodings_map -- dictionary mapping suffixes to encodings
17 types_map -- dictionary mapping suffixes to types
19 Functions:
21 init([files]) -- parse a list of files, default knownfiles
22 read_mime_types(file) -- parse one file, return a dictionary or None
23 """
25 import os
26 import posixpath
27 import urllib
# Names exported by "from mimetypes import *": the public functions, the
# MimeTypes class, and the tuning data listed in the module docstring.
# Note that init() rebinds several of these names, so a star-import only
# captures the objects bound at import time.
__all__ = [
    "guess_type", "guess_extension", "read_mime_types", "init",
    "MimeTypes",
    "knownfiles", "inited",
    "suffix_map", "encodings_map", "types_map", "common_types",
    ]
# Files that init() parses by default, in this order.  A later file
# overrides the suffixes defined by an earlier one.  The Apache 1.2 path
# used to be listed twice (first and third); only the later position is
# kept, which preserves the effective precedence while avoiding a second
# parse of the same file.
knownfiles = [
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Flag set (to 1) by init() once the module-level tables have been set up.
inited = 0
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Each instance works on its own copies of the module-level tables
    (encodings_map, suffix_map, types_map and common_types), taken when
    the instance is created.
    """

    def __init__(self, filenames=()):
        """Create a datastore seeded from the module-level tables.

        filenames -- optional sequence of mime.types-style files to read
                     (in order) on top of the copied defaults.

        If the module has not been initialized yet, init() is called
        first so that the module-level tables are in their final state
        before they are copied.
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = types_map.copy()
        self.common_types = common_types.copy()
        for name in filenames:
            self.read(name)

    def guess_type(self, url, strict=1):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        scheme, url = urllib.splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mime_type = url[:semi]
            else:
                mime_type = url[:comma]
            # A leading parameter (contains '=') or a missing '/' means
            # no explicit type/subtype was given.
            if '=' in mime_type or '/' not in mime_type:
                mime_type = 'text/plain'
            return mime_type, None      # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until the
        # suffix is no longer an alias.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        # Peel off an encoding suffix, if any, and look at the suffix
        # underneath it for the actual type.
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        types_map = self.types_map
        common_types = self.common_types
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        elif strict:
            return None, encoding
        elif ext in common_types:
            return common_types[ext], encoding
        elif ext.lower() in common_types:
            return common_types[ext.lower()], encoding
        else:
            return None, encoding

    def guess_extension(self, type, strict=1):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        wanted = type.lower()
        for ext, stype in self.types_map.items():
            if wanted == stype:
                return ext
        if not strict:
            # Consult this instance's own table (not the module-level
            # one), mirroring what guess_type() does.
            for ext, stype in self.common_types.items():
                if wanted == stype:
                    return ext
        return None

    def read(self, filename):
        """Read a single mime.types-format file, specified by pathname.

        Errors from open() or from parsing propagate to the caller; the
        file is closed in either case.
        """
        fp = open(filename)
        try:
            self.readfp(fp)
        finally:
            fp.close()

    def readfp(self, fp):
        """Read a single mime.types-format file.

        fp -- an open file-like object providing readline().

        Each non-empty line has the form "type suffix [suffix ...]".
        A word starting with '#' begins a comment that runs to the end
        of the line.  Every suffix is stored in self.types_map with a
        leading dot, replacing any existing entry for that suffix.
        """
        types_map = self.types_map
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop the first word that starts a comment and everything
            # after it.
            for i in range(len(words)):
                if words[i][0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            mime_type, suffixes = words[0], words[1:]
            for suff in suffixes:
                types_map['.' + suff] = mime_type
def guess_type(url, strict=1):
    """Guess the MIME type and encoding of the file named by a URL.

    The result is a (type, encoding) tuple.  type is a string of the
    form type/subtype, usable for a MIME Content-type header, or None
    when it can't be guessed (no suffix, or an unknown one).  encoding
    is the name of the program used to encode the file (e.g. compress
    or gzip), or None when there is no encoding.  All lookups are table
    driven.  Encoding suffixes are matched case sensitively; type
    suffixes are tried case sensitively first, then case insensitively.

    The suffixes .tgz, .taz and .tz (case sensitive!) are treated as
    ".tar.gz"; this is table driven as well, through the dictionary
    suffix_map.

    When the optional `strict' argument is false, a number of commonly
    found but non-standard types are recognized too.
    """
    # init() rebinds the module-level name guess_type to the bound method
    # of its MimeTypes instance, so the call below resolves to that
    # method and not to this function.
    init()
    return guess_type(url, strict)
def guess_extension(type, strict=1):
    """Guess a filename extension for the given MIME type.

    The result is a string holding the extension, leading dot ('.')
    included, or None when nothing can be guessed for `type'.  The
    extension is not guaranteed to have been associated with any
    particular data stream, but guess_type() would map it to the MIME
    type `type'.

    When the optional `strict' argument is false, a number of commonly
    found but non-standard types are considered too.
    """
    # init() rebinds the module-level name guess_extension to the bound
    # method of its MimeTypes instance, so the call below resolves to
    # that method and not to this function.
    init()
    return guess_extension(type, strict)
def init(files=None):
    """Initialize the module-level tables and guessing functions.

    files -- optional sequence of mime.types-style file names to parse;
             defaults to knownfiles.  Names that are not existing
             regular files are skipped.

    A MimeTypes instance is built from the current module-level tables,
    the files are read into it, and then the module-level names
    encodings_map, suffix_map, types_map, common_types, guess_type and
    guess_extension are rebound to that instance's tables and methods.
    """
    global guess_extension, guess_type
    global suffix_map, types_map, encodings_map, common_types
    global inited
    # Set the flag before creating the instance: MimeTypes.__init__ calls
    # init() while the flag is unset, which would recurse without end.
    inited = 1
    db = MimeTypes()
    if files is None:
        files = knownfiles
    for filename in files:
        if os.path.isfile(filename):
            fp = open(filename)
            try:
                db.readfp(fp)
            finally:
                # Close explicitly instead of relying on garbage
                # collection to release the handle.
                fp.close()
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map
    guess_extension = db.guess_extension
    guess_type = db.guess_type
    common_types = db.common_types
def read_mime_types(file):
    """Parse one mime.types-style file and return the resulting table.

    file -- pathname of the file to parse.

    Returns None if the file cannot be opened.  Otherwise returns the
    types_map dictionary (suffix -> type) of a fresh MimeTypes instance
    after the file has been read into it.
    """
    try:
        f = open(file)
    except IOError:
        return None
    try:
        db = MimeTypes()
        db.readfp(f)
    finally:
        # Always release the handle, even if parsing fails.
        f.close()
    return db.types_map
# Shorthand suffixes and the full suffix each one stands for.  Keys are
# matched case sensitively.
suffix_map = {
    '.tgz': '.tar.gz',      # gzip-compressed tar archive
    '.taz': '.tar.gz',
    '.tz': '.tar.gz',
    }
# Suffixes that denote an encoding, mapped to the encoding's name.
# Keys are matched case sensitively ('.Z' is upper case on purpose).
encodings_map = {
    '.gz': 'gzip',
    '.Z': 'compress',
    }
# Before adding new types, make sure they are either registered with IANA, at
# http://www.isi.edu/in-notes/iana/assignments/media-types
# or extensions, i.e. using the x- prefix
#
# If you add to these, please keep them sorted!
#
# Two suffixes are in use for more than one type.  A dictionary holds a
# single value per key, so only the entry that was effective before is
# listed (the other one was silently overwritten inside the literal):
#   '.cdf' -- also 'application/x-cdf'
#   '.xls' -- also 'application/excel'
types_map = {
    '.a': 'application/octet-stream',
    '.ai': 'application/postscript',
    '.aif': 'audio/x-aiff',
    '.aifc': 'audio/x-aiff',
    '.aiff': 'audio/x-aiff',
    '.au': 'audio/basic',
    '.avi': 'video/x-msvideo',
    '.bat': 'text/plain',
    '.bcpio': 'application/x-bcpio',
    '.bin': 'application/octet-stream',
    '.bmp': 'image/x-ms-bmp',
    '.c': 'text/plain',
    '.cdf': 'application/x-netcdf',
    '.cpio': 'application/x-cpio',
    '.csh': 'application/x-csh',
    '.css': 'text/css',
    '.dll': 'application/octet-stream',
    '.doc': 'application/msword',
    '.dot': 'application/msword',
    '.dvi': 'application/x-dvi',
    '.eml': 'message/rfc822',
    '.eps': 'application/postscript',
    '.etx': 'text/x-setext',
    '.exe': 'application/octet-stream',
    '.gif': 'image/gif',
    '.gtar': 'application/x-gtar',
    '.h': 'text/plain',
    '.hdf': 'application/x-hdf',
    '.htm': 'text/html',
    '.html': 'text/html',
    '.ief': 'image/ief',
    '.jpe': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.jpg': 'image/jpeg',
    '.js': 'application/x-javascript',
    '.ksh': 'text/plain',
    '.latex': 'application/x-latex',
    '.m1v': 'video/mpeg',
    '.man': 'application/x-troff-man',
    '.me': 'application/x-troff-me',
    '.mht': 'message/rfc822',
    '.mhtml': 'message/rfc822',
    '.mif': 'application/x-mif',
    '.mov': 'video/quicktime',
    '.movie': 'video/x-sgi-movie',
    '.mp2': 'audio/mpeg',
    '.mp3': 'audio/mpeg',
    '.mpa': 'video/mpeg',
    '.mpe': 'video/mpeg',
    '.mpeg': 'video/mpeg',
    '.mpg': 'video/mpeg',
    '.ms': 'application/x-troff-ms',
    '.nc': 'application/x-netcdf',
    '.nws': 'message/rfc822',
    '.o': 'application/octet-stream',
    '.obj': 'application/octet-stream',
    '.oda': 'application/oda',
    '.p12': 'application/x-pkcs12',
    '.p7c': 'application/pkcs7-mime',
    '.pbm': 'image/x-portable-bitmap',
    '.pdf': 'application/pdf',
    '.pfx': 'application/x-pkcs12',
    '.pgm': 'image/x-portable-graymap',
    '.pl': 'text/plain',
    '.png': 'image/png',
    '.pnm': 'image/x-portable-anymap',
    '.pot': 'application/vnd.ms-powerpoint',
    '.ppa': 'application/vnd.ms-powerpoint',
    '.ppm': 'image/x-portable-pixmap',
    '.pps': 'application/vnd.ms-powerpoint',
    '.ppt': 'application/vnd.ms-powerpoint',
    '.ps': 'application/postscript',
    '.pwz': 'application/vnd.ms-powerpoint',
    '.py': 'text/x-python',
    '.pyc': 'application/x-python-code',
    '.pyo': 'application/x-python-code',
    '.qt': 'video/quicktime',
    '.ra': 'audio/x-pn-realaudio',
    '.ram': 'application/x-pn-realaudio',
    '.ras': 'image/x-cmu-raster',
    '.rdf': 'application/xml',
    '.rgb': 'image/x-rgb',
    '.roff': 'application/x-troff',
    '.rtx': 'text/richtext',
    '.sgm': 'text/x-sgml',
    '.sgml': 'text/x-sgml',
    '.sh': 'application/x-sh',
    '.shar': 'application/x-shar',
    '.snd': 'audio/basic',
    '.so': 'application/octet-stream',
    '.src': 'application/x-wais-source',
    '.sv4cpio': 'application/x-sv4cpio',
    '.sv4crc': 'application/x-sv4crc',
    '.t': 'application/x-troff',
    '.tar': 'application/x-tar',
    '.tcl': 'application/x-tcl',
    '.tex': 'application/x-tex',
    '.texi': 'application/x-texinfo',
    '.texinfo': 'application/x-texinfo',
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.tr': 'application/x-troff',
    '.tsv': 'text/tab-separated-values',
    '.txt': 'text/plain',
    '.ustar': 'application/x-ustar',
    '.vcf': 'text/x-vcard',
    '.wav': 'audio/x-wav',
    '.wiz': 'application/msword',
    '.xbm': 'image/x-xbitmap',
    '.xlb': 'application/vnd.ms-excel',
    '.xls': 'application/vnd.ms-excel',
    '.xml': 'text/xml',
    '.xpm': 'image/x-xpixmap',
    '.xsl': 'application/xml',
    '.xwd': 'image/x-xwindowdump',
    '.zip': 'application/zip',
    }
# Non-standard types that are nevertheless commonly found in the wild.
# These are consulted only when the API methods are called with strict=0.
#
# Please keep these sorted as well.
common_types = {
    '.jpg': 'image/jpg',
    '.mid': 'audio/midi',
    '.midi': 'audio/midi',
    '.pct': 'image/pict',
    '.pic': 'image/pict',
    '.pict': 'image/pict',
    '.rtf': 'application/rtf',
    '.xul': 'text/xul',
    }
391 if __name__ == '__main__':
392 import sys
393 import getopt
395 USAGE = """\
396 Usage: mimetypes.py [options] type
398 Options:
399 --help / -h -- print this message and exit
400 --lenient / -l -- additionally search of some common, but non-standard
401 types.
402 --extension / -e -- guess extension instead of type
404 More than one type argument may be given.
407 def usage(code, msg=''):
408 print USAGE
409 if msg: print msg
410 sys.exit(code)
412 try:
413 opts, args = getopt.getopt(sys.argv[1:], 'hle',
414 ['help', 'lenient', 'extension'])
415 except getopt.error, msg:
416 usage(1, msg)
418 strict = 1
419 extension = 0
420 for opt, arg in opts:
421 if opt in ('-h', '--help'):
422 usage(0)
423 elif opt in ('-l', '--lenient'):
424 strict = 0
425 elif opt in ('-e', '--extension'):
426 extension = 1
427 for gtype in args:
428 if extension:
429 guess = guess_extension(gtype, strict)
430 if not guess: print "I don't know anything about type", gtype
431 else: print guess
432 else:
433 guess, encoding = guess_type(gtype, strict)
434 if not guess: print "I don't know anything about type", gtype
435 else: print 'type:', guess, 'encoding:', encoding