More installation info. Bump alpha version.
[python/dscho.git] / Lib / mimetypes.py
blob846d56b2cd5f3fb754d86740dd51cca7c92c7c69
"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.

guess_extension(type, strict=True) -- guess the extension for a given MIME type.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types

Functions:

init([files]) -- parse a list of files, default knownfiles
read_mime_types(file) -- parse one file, return a dictionary or None
"""
25 import os
26 import posixpath
27 import urllib
# Names exported by ``from mimetypes import *``.
__all__ = [
    "guess_type", "guess_extension", "guess_all_extensions",
    "add_type", "read_mime_types", "init",
]
# Candidate mime.types files; init() parses each one that exists, in order.
knownfiles = [
    "/etc/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2 (same path as above)
    "/usr/local/etc/mime.types",                # Apache 1.3
]

# Set to True by init() so that MimeTypes() does not trigger it again.
inited = False
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- private copy of the module-level encodings_map
    suffix_map -- private copy of the module-level suffix_map
    types_map -- pair of dicts (non-strict, strict) mapping extension
                 to type; index it with the boolean `strict' flag
    types_map_inv -- pair of dicts (non-strict, strict) mapping type
                     to the list of its known extensions
    """

    def __init__(self, filenames=(), strict=True):
        """Seed the store from the module tables, then read `filenames'.

        Each name in `filenames' is parsed with read(); `strict' selects
        which table (standard or non-standard) those files populate.
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = ({}, {}) # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, mime_type) in types_map.items():
            self.add_type(mime_type, ext, True)
        for (ext, mime_type) in common_types.items():
            self.add_type(mime_type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        scheme, url = urllib.splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mime_type = url[:semi]
            else:
                mime_type = url[:comma]
            if '=' in mime_type or '/' not in mime_type:
                mime_type = 'text/plain'
            return mime_type, None      # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until none apply.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        # Peel off an encoding suffix, leaving the type suffix in `ext'.
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        # The standard table is always consulted first.
        types_map = self.types_map[True]
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        elif strict:
            return None, encoding
        types_map = self.types_map[False]
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        else:
            return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Work on a copy: appending to the stored list would leak the
        # non-standard extensions into every later strict lookup.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        if extensions:
            return extensions
        return None

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if extensions is None:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        fp = open(filename)
        try:
            # Forward `strict'; otherwise the file would always land in the
            # standard table regardless of what the caller asked for.
            self.readfp(fp, strict)
        finally:
            fp.close()

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' only needs a readline() method.  Each non-empty line is
        "type suffix [suffix ...]"; a word starting with '#' begins a
        comment that runs to the end of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop the first comment word and everything after it.
            for i in range(len(words)):
                if words[i][0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            mime_type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(mime_type, '.' + suff, strict)
def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    # init() rebinds the module-level name guess_type to a bound method of
    # the shared database, so the call below is not a recursive call.
    init()
    return guess_type(url, strict)
def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', None
    is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    # init() rebinds the module-level name guess_all_extensions to a bound
    # method of the shared database, so the call below is not recursive.
    init()
    return guess_all_extensions(type, strict)
def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    # init() rebinds the module-level name guess_extension to a bound
    # method of the shared database, so the call below is not recursive.
    init()
    return guess_extension(type, strict)
def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    When the extension is already known, the new
    type will replace the old one. When the type
    is already known the extension will be added
    to the list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    # This is a module-level function, so it takes no `self'; the signature
    # matches the bound method that init() installs under the same name.
    init()
    return add_type(type, ext, strict)
def init(files=None):
    """Build the shared database and publish it through the module globals.

    `files' is a list of mime.types-style pathnames to parse (default:
    knownfiles); entries that are not existing regular files are skipped.
    Afterwards the module-level tables (encodings_map, suffix_map,
    types_map, common_types) refer to the database's own dictionaries and
    the module-level functions (guess_type, guess_extension,
    guess_all_extensions, add_type) are rebound to its methods.
    """
    global guess_all_extensions, guess_extension, guess_type
    global suffix_map, types_map, encodings_map, common_types
    global add_type, inited
    # Set the flag before constructing MimeTypes, whose __init__ would
    # otherwise call init() again.
    inited = True
    db = MimeTypes()
    if files is None:
        files = knownfiles
    for path in files:
        if os.path.isfile(path):
            fp = open(path)
            try:
                db.readfp(fp)
            finally:
                # Close explicitly rather than relying on garbage collection.
                fp.close()
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    guess_all_extensions = db.guess_all_extensions
    guess_extension = db.guess_extension
    guess_type = db.guess_type
    add_type = db.add_type
    common_types = db.types_map[False]
def read_mime_types(file):
    """Parse one mime.types-format file and return its mapping.

    `file' is a pathname.  Returns None if the file cannot be opened;
    otherwise returns a dictionary mapping extensions (with the leading
    dot) to MIME types.  The dictionary comes from a fresh MimeTypes
    instance, so it holds the module's standard table with the entries
    from `file' applied on top.
    """
    try:
        f = open(file)
    except IOError:
        return None
    try:
        db = MimeTypes()
        db.readfp(f, True)
    finally:
        f.close()
    # types_map is a (non-strict, strict) pair; callers are promised a
    # single dictionary, so hand back the standard table.
    return db.types_map[True]
# Shorthand suffixes and the full suffix each one stands for; guess_type()
# substitutes these before looking for an encoding or a type.
suffix_map = {
    '.tgz': '.tar.gz',
    '.taz': '.tar.gz',
    '.tz': '.tar.gz',
}
# Suffixes that name an encoding (compression program) rather than a type.
# Lookups are case sensitive: '.Z' is compress, '.z' is not listed.
encodings_map = {
    '.gz': 'gzip',
    '.Z': 'compress',
}
# Before adding new types, make sure they are either registered with IANA, at
# http://www.isi.edu/in-notes/iana/assignments/media-types
# or extensions, i.e. using the x- prefix

# If you add to these, please keep them sorted!
types_map = {
    '.a'      : 'application/octet-stream',
    '.ai'     : 'application/postscript',
    '.aif'    : 'audio/x-aiff',
    '.aifc'   : 'audio/x-aiff',
    '.aiff'   : 'audio/x-aiff',
    '.au'     : 'audio/basic',
    '.avi'    : 'video/x-msvideo',
    '.bat'    : 'text/plain',
    '.bcpio'  : 'application/x-bcpio',
    '.bin'    : 'application/octet-stream',
    '.bmp'    : 'image/x-ms-bmp',
    '.c'      : 'text/plain',
    # Duplicates :(
    # A dict literal keeps only the last value given for a key, so the
    # first '.cdf' entry never took effect; it is kept here as a comment.
    # '.cdf'    : 'application/x-cdf',
    '.cdf'    : 'application/x-netcdf',
    '.cpio'   : 'application/x-cpio',
    '.csh'    : 'application/x-csh',
    '.css'    : 'text/css',
    '.dll'    : 'application/octet-stream',
    '.doc'    : 'application/msword',
    '.dot'    : 'application/msword',
    '.dvi'    : 'application/x-dvi',
    '.eml'    : 'message/rfc822',
    '.eps'    : 'application/postscript',
    '.etx'    : 'text/x-setext',
    '.exe'    : 'application/octet-stream',
    '.gif'    : 'image/gif',
    '.gtar'   : 'application/x-gtar',
    '.h'      : 'text/plain',
    '.hdf'    : 'application/x-hdf',
    '.htm'    : 'text/html',
    '.html'   : 'text/html',
    '.ief'    : 'image/ief',
    '.jpe'    : 'image/jpeg',
    '.jpeg'   : 'image/jpeg',
    '.jpg'    : 'image/jpeg',
    '.js'     : 'application/x-javascript',
    '.ksh'    : 'text/plain',
    '.latex'  : 'application/x-latex',
    '.m1v'    : 'video/mpeg',
    '.man'    : 'application/x-troff-man',
    '.me'     : 'application/x-troff-me',
    '.mht'    : 'message/rfc822',
    '.mhtml'  : 'message/rfc822',
    '.mif'    : 'application/x-mif',
    '.mov'    : 'video/quicktime',
    '.movie'  : 'video/x-sgi-movie',
    '.mp2'    : 'audio/mpeg',
    '.mp3'    : 'audio/mpeg',
    '.mpa'    : 'video/mpeg',
    '.mpe'    : 'video/mpeg',
    '.mpeg'   : 'video/mpeg',
    '.mpg'    : 'video/mpeg',
    '.ms'     : 'application/x-troff-ms',
    '.nc'     : 'application/x-netcdf',
    '.nws'    : 'message/rfc822',
    '.o'      : 'application/octet-stream',
    '.obj'    : 'application/octet-stream',
    '.oda'    : 'application/oda',
    '.p12'    : 'application/x-pkcs12',
    '.p7c'    : 'application/pkcs7-mime',
    '.pbm'    : 'image/x-portable-bitmap',
    '.pdf'    : 'application/pdf',
    '.pfx'    : 'application/x-pkcs12',
    '.pgm'    : 'image/x-portable-graymap',
    '.pl'     : 'text/plain',
    '.png'    : 'image/png',
    '.pnm'    : 'image/x-portable-anymap',
    '.pot'    : 'application/vnd.ms-powerpoint',
    '.ppa'    : 'application/vnd.ms-powerpoint',
    '.ppm'    : 'image/x-portable-pixmap',
    '.pps'    : 'application/vnd.ms-powerpoint',
    '.ppt'    : 'application/vnd.ms-powerpoint',
    '.ps'     : 'application/postscript',
    '.pwz'    : 'application/vnd.ms-powerpoint',
    '.py'     : 'text/x-python',
    '.pyc'    : 'application/x-python-code',
    '.pyo'    : 'application/x-python-code',
    '.qt'     : 'video/quicktime',
    '.ra'     : 'audio/x-pn-realaudio',
    '.ram'    : 'application/x-pn-realaudio',
    '.ras'    : 'image/x-cmu-raster',
    '.rdf'    : 'application/xml',
    '.rgb'    : 'image/x-rgb',
    '.roff'   : 'application/x-troff',
    '.rtx'    : 'text/richtext',
    '.sgm'    : 'text/x-sgml',
    '.sgml'   : 'text/x-sgml',
    '.sh'     : 'application/x-sh',
    '.shar'   : 'application/x-shar',
    '.snd'    : 'audio/basic',
    '.so'     : 'application/octet-stream',
    '.src'    : 'application/x-wais-source',
    '.sv4cpio': 'application/x-sv4cpio',
    '.sv4crc' : 'application/x-sv4crc',
    '.t'      : 'application/x-troff',
    '.tar'    : 'application/x-tar',
    '.tcl'    : 'application/x-tcl',
    '.tex'    : 'application/x-tex',
    '.texi'   : 'application/x-texinfo',
    '.texinfo': 'application/x-texinfo',
    '.tif'    : 'image/tiff',
    '.tiff'   : 'image/tiff',
    '.tr'     : 'application/x-troff',
    '.tsv'    : 'text/tab-separated-values',
    '.txt'    : 'text/plain',
    '.ustar'  : 'application/x-ustar',
    '.vcf'    : 'text/x-vcard',
    '.wav'    : 'audio/x-wav',
    '.wiz'    : 'application/msword',
    '.xbm'    : 'image/x-xbitmap',
    '.xlb'    : 'application/vnd.ms-excel',
    # Duplicates :(
    # As with '.cdf', only the last '.xls' value was ever used.
    # '.xls'    : 'application/excel',
    '.xls'    : 'application/vnd.ms-excel',
    '.xml'    : 'text/xml',
    '.xpm'    : 'image/x-xpixmap',
    '.xsl'    : 'application/xml',
    '.xwd'    : 'image/x-xwindowdump',
    '.zip'    : 'application/zip',
}
# These are non-standard types, commonly found in the wild.  They will only
# match if strict=0 flag is given to the API methods.

# Please sort these too
common_types = {
    '.jpg' : 'image/jpg',
    '.mid' : 'audio/midi',
    '.midi': 'audio/midi',
    '.pct' : 'image/pict',
    '.pic' : 'image/pict',
    '.pict': 'image/pict',
    '.rtf' : 'application/rtf',
    '.xul' : 'text/xul'
}
# Command-line driver: guess the type (or, with -e, the extension) for each
# argument.
if __name__ == '__main__':
    import sys
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    # Every print below takes exactly one parenthesised argument, so it
    # parses and prints the same text whether print is a statement or a
    # function.

    def usage(code, msg=''):
        """Print the usage text, then `msg' if given, and exit with `code'."""
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error:
        # Fetch the exception via sys.exc_info() so this clause parses
        # under both "except E, name" and "except E as name" grammars.
        usage(1, sys.exc_info()[1])

    strict = True
    extension = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = False
        elif opt in ('-e', '--extension'):
            extension = True
    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if not guess:
                print("I don't know anything about type %s" % gtype)
            else:
                print(guess)
        else:
            guess, encoding = guess_type(gtype, strict)
            if not guess:
                print("I don't know anything about type %s" % gtype)
            else:
                print('type: %s encoding: %s' % (guess, encoding))