Lib/mimetypes.py

   1 """Guess the MIME type of a file.
   2
   3 This module defines two useful functions:
   4
   5 guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
   6
   7 guess_extension(type, strict=1) -- guess the extension for a given MIME type.
   8
   9 It also contains the following, for tuning the behavior:
  10
  11 Data:
  12
  13 knownfiles -- list of files to parse
  14 inited -- flag set when init() has been called
  15 suffix_map -- dictionary mapping suffixes to suffixes
  16 encodings_map -- dictionary mapping suffixes to encodings
  17 types_map -- dictionary mapping suffixes to types
  18
  19 Functions:
  20
  21 init([files]) -- parse a list of files, default knownfiles
  22 read_mime_types(file) -- parse one file, return a dictionary or None
  23 """
  24
  25 import os
  26 import posixpath
  27 import urllib
  28
  29 __all__ = [
  30     "guess_type","guess_extension","guess_all_extensions",
  31     "add_type","read_mime_types","init"
  32 ]
  33
  34 knownfiles = [
  35     "/etc/mime.types",
  36     "/usr/local/etc/httpd/conf/mime.types",
  37     "/usr/local/lib/netscape/mime.types",
  38     "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
  39     "/usr/local/etc/mime.types",                # Apache 1.3
  40     ]
  41
  42 inited = False
  43
  44
  45 class MimeTypes:
  46     """MIME-types datastore.
  47
  48     This datastore can handle information from mime.types-style files
  49     and supports basic determination of MIME type from a filename or
  50     URL, and can guess a reasonable extension given a MIME type.
  51     """
  52
  53     def __init__(self, filenames=(), strict=True):
  54         if not inited:
  55             init()
  56         self.encodings_map = encodings_map.copy()
  57         self.suffix_map = suffix_map.copy()
  58         self.types_map = ({}, {}) # dict for (non-strict, strict)
  59         self.types_map_inv = ({}, {})
  60         for (ext, type) in types_map.items():
  61             self.add_type(type, ext, True)
  62         for (ext, type) in common_types.items():
  63             self.add_type(type, ext, False)
  64         for name in filenames:
  65             self.read(name, strict)
  66
  67     def add_type(self, type, ext, strict=True):
  68         """Add a mapping between a type and and extension.
  69
  70         When the extension is already known, the new
  71         type will replace the old one. When the type
  72         is already known the extension will be added
  73         to the list of known extensions.
  74
  75         If strict is true, information will be added to
  76         list of standard types, else to the list of non-standard
  77         types.
  78         """
  79         self.types_map[strict][ext] = type
  80         exts = self.types_map_inv[strict].setdefault(type, [])
  81         if ext not in exts:
  82             exts.append(ext)
  83
  84     def guess_type(self, url, strict=True):
  85         """Guess the type of a file based on its URL.
  86
  87         Return value is a tuple (type, encoding) where type is None if
  88         the type can't be guessed (no or unknown suffix) or a string
  89         of the form type/subtype, usable for a MIME Content-type
  90         header; and encoding is None for no encoding or the name of
  91         the program used to encode (e.g. compress or gzip).  The
  92         mappings are table driven.  Encoding suffixes are case
  93         sensitive; type suffixes are first tried case sensitive, then
  94         case insensitive.
  95
  96         The suffixes .tgz, .taz and .tz (case sensitive!) are all
  97         mapped to '.tar.gz'.  (This is table-driven too, using the
  98         dictionary suffix_map.)
  99
 100         Optional `strict' argument when False adds a bunch of commonly found,
 101         but non-standard types.
 102         """
 103         scheme, url = urllib.splittype(url)
 104         if scheme == 'data':
 105             # syntax of data URLs:
 106             # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
 107             # mediatype := [ type "/" subtype ] *( ";" parameter )
 108             # data      := *urlchar
 109             # parameter := attribute "=" value
 110             # type/subtype defaults to "text/plain"
 111             comma = url.find(',')
 112             if comma < 0:
 113                 # bad data URL
 114                 return None, None
 115             semi = url.find(';', 0, comma)
 116             if semi >= 0:
 117                 type = url[:semi]
 118             else:
 119                 type = url[:comma]
 120             if '=' in type or '/' not in type:
 121                 type = 'text/plain'
 122             return type, None           # never compressed, so encoding is None
 123         base, ext = posixpath.splitext(url)
 124         while ext in self.suffix_map:
 125             base, ext = posixpath.splitext(base + self.suffix_map[ext])
 126         if ext in self.encodings_map:
 127             encoding = self.encodings_map[ext]
 128             base, ext = posixpath.splitext(base)
 129         else:
 130             encoding = None
 131         types_map = self.types_map[True]
 132         if ext in types_map:
 133             return types_map[ext], encoding
 134         elif ext.lower() in types_map:
 135             return types_map[ext.lower()], encoding
 136         elif strict:
 137             return None, encoding
 138         types_map = self.types_map[False]
 139         if ext in types_map:
 140             return types_map[ext], encoding
 141         elif ext.lower() in types_map:
 142             return types_map[ext.lower()], encoding
 143         else:
 144             return None, encoding
 145
 146     def guess_all_extensions(self, type, strict=True):
 147         """Guess the extensions for a file based on its MIME type.
 148
 149         Return value is a list of strings giving the possible filename
 150         extensions, including the leading dot ('.').  The extension is not
 151         guaranteed to have been associated with any particular data
 152         stream, but would be mapped to the MIME type `type' by
 153         guess_type().  If no extension can be guessed for `type', None
 154         is returned.
 155
 156         Optional `strict' argument when false adds a bunch of commonly found,
 157         but non-standard types.
 158         """
 159         type = type.lower()
 160         extensions = self.types_map_inv[True].get(type, [])
 161         if not strict:
 162             for ext in self.types_map_inv[False].get(type, []):
 163                 if ext not in extensions:
 164                     extensions.append(ext)
 165         if len(extensions):
 166             return extensions
 167
 168     def guess_extension(self, type, strict=True):
 169         """Guess the extension for a file based on its MIME type.
 170
 171         Return value is a string giving a filename extension,
 172         including the leading dot ('.').  The extension is not
 173         guaranteed to have been associated with any particular data
 174         stream, but would be mapped to the MIME type `type' by
 175         guess_type().  If no extension can be guessed for `type', None
 176         is returned.
 177
 178         Optional `strict' argument when false adds a bunch of commonly found,
 179         but non-standard types.
 180         """
 181         extensions = self.guess_all_extensions(type, strict)
 182         if extensions is not None:
 183             extensions = extensions[0]
 184         return extensions
 185
 186     def read(self, filename, strict=True):
 187         """
 188         Read a single mime.types-format file, specified by pathname.
 189
 190         If strict is true, information will be added to
 191         list of standard types, else to the list of non-standard
 192         types.
 193         """
 194         fp = open(filename)
 195         self.readfp(fp)
 196         fp.close()
 197
 198     def readfp(self, fp, strict=True):
 199         """
 200         Read a single mime.types-format file.
 201
 202         If strict is true, information will be added to
 203         list of standard types, else to the list of non-standard
 204         types.
 205         """
 206         while 1:
 207             line = fp.readline()
 208             if not line:
 209                 break
 210             words = line.split()
 211             for i in range(len(words)):
 212                 if words[i][0] == '#':
 213                     del words[i:]
 214                     break
 215             if not words:
 216                 continue
 217             type, suffixes = words[0], words[1:]
 218             suffixes = [ '.' + suff for suff in suffixes ]
 219             for suff in suffixes:
 220                 self.add_type(type, suff, strict)
 221
 222 def guess_type(url, strict=True):
 223     """Guess the type of a file based on its URL.
 224
 225     Return value is a tuple (type, encoding) where type is None if the
 226     type can't be guessed (no or unknown suffix) or a string of the
 227     form type/subtype, usable for a MIME Content-type header; and
 228     encoding is None for no encoding or the name of the program used
 229     to encode (e.g. compress or gzip).  The mappings are table
 230     driven.  Encoding suffixes are case sensitive; type suffixes are
 231     first tried case sensitive, then case insensitive.
 232
 233     The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
 234     to ".tar.gz".  (This is table-driven too, using the dictionary
 235     suffix_map).
 236
 237     Optional `strict' argument when false adds a bunch of commonly found, but
 238     non-standard types.
 239     """
 240     init()
 241     return guess_type(url, strict)
 242
 243
 244 def guess_all_extensions(type, strict=True):
 245     """Guess the extensions for a file based on its MIME type.
 246
 247     Return value is a list of strings giving the possible filename
 248     extensions, including the leading dot ('.').  The extension is not
 249     guaranteed to have been associated with any particular data
 250     stream, but would be mapped to the MIME type `type' by
 251     guess_type().  If no extension can be guessed for `type', None
 252     is returned.
 253
 254     Optional `strict' argument when false adds a bunch of commonly found,
 255     but non-standard types.
 256     """
 257     init()
 258     return guess_all_extensions(type, strict)
 259
 260 def guess_extension(type, strict=True):
 261     """Guess the extension for a file based on its MIME type.
 262
 263     Return value is a string giving a filename extension, including the
 264     leading dot ('.').  The extension is not guaranteed to have been
 265     associated with any particular data stream, but would be mapped to the
 266     MIME type `type' by guess_type().  If no extension can be guessed for
 267     `type', None is returned.
 268
 269     Optional `strict' argument when false adds a bunch of commonly found,
 270     but non-standard types.
 271     """
 272     init()
 273     return guess_extension(type, strict)
 274
 275 def add_type(self, type, ext, strict=True):
 276     """Add a mapping between a type and and extension.
 277
 278     When the extension is already known, the new
 279     type will replace the old one. When the type
 280     is already known the extension will be added
 281     to the list of known extensions.
 282
 283     If strict is true, information will be added to
 284     list of standard types, else to the list of non-standard
 285     types.
 286     """
 287     init()
 288     return add_type(type, ext, strict)
 289
 290
 291 def init(files=None):
 292     global guess_all_extensions, guess_extension, guess_type
 293     global suffix_map, types_map, encodings_map, common_types
 294     global add_type, inited
 295     inited = True
 296     db = MimeTypes()
 297     if files is None:
 298         files = knownfiles
 299     for file in files:
 300         if os.path.isfile(file):
 301             db.readfp(open(file))
 302     encodings_map = db.encodings_map
 303     suffix_map = db.suffix_map
 304     types_map = db.types_map[True]
 305     guess_all_extensions = db.guess_all_extensions
 306     guess_extension = db.guess_extension
 307     guess_type = db.guess_type
 308     add_type = db.add_type
 309     common_types = db.types_map[False]
 310
 311
 312 def read_mime_types(file):
 313     try:
 314         f = open(file)
 315     except IOError:
 316         return None
 317     db = MimeTypes()
 318     db.readfp(f)
 319     return db.types_map
 320
 321
 322 suffix_map = {
 323     '.tgz': '.tar.gz',
 324     '.taz': '.tar.gz',
 325     '.tz': '.tar.gz',
 326     }
 327
 328 encodings_map = {
 329     '.gz': 'gzip',
 330     '.Z': 'compress',
 331     }
 332
 333 # Before adding new types, make sure they are either registered with IANA, at
 334 # http://www.isi.edu/in-notes/iana/assignments/media-types
 335 # or extensions, i.e. using the x- prefix
 336
 337 # If you add to these, please keep them sorted!
 338 types_map = {
 339     '.a'      : 'application/octet-stream',
 340     '.ai'     : 'application/postscript',
 341     '.aif'    : 'audio/x-aiff',
 342     '.aifc'   : 'audio/x-aiff',
 343     '.aiff'   : 'audio/x-aiff',
 344     '.au'     : 'audio/basic',
 345     '.avi'    : 'video/x-msvideo',
 346     '.bat'    : 'text/plain',
 347     '.bcpio'  : 'application/x-bcpio',
 348     '.bin'    : 'application/octet-stream',
 349     '.bmp'    : 'image/x-ms-bmp',
 350     '.c'      : 'text/plain',
 351     # Duplicates :(
 352     '.cdf'    : 'application/x-cdf',
 353     '.cdf'    : 'application/x-netcdf',
 354     '.cpio'   : 'application/x-cpio',
 355     '.csh'    : 'application/x-csh',
 356     '.css'    : 'text/css',
 357     '.dll'    : 'application/octet-stream',
 358     '.doc'    : 'application/msword',
 359     '.dot'    : 'application/msword',
 360     '.dvi'    : 'application/x-dvi',
 361     '.eml'    : 'message/rfc822',
 362     '.eps'    : 'application/postscript',
 363     '.etx'    : 'text/x-setext',
 364     '.exe'    : 'application/octet-stream',
 365     '.gif'    : 'image/gif',
 366     '.gtar'   : 'application/x-gtar',
 367     '.h'      : 'text/plain',
 368     '.hdf'    : 'application/x-hdf',
 369     '.htm'    : 'text/html',
 370     '.html'   : 'text/html',
 371     '.ief'    : 'image/ief',
 372     '.jpe'    : 'image/jpeg',
 373     '.jpeg'   : 'image/jpeg',
 374     '.jpg'    : 'image/jpeg',
 375     '.js'     : 'application/x-javascript',
 376     '.ksh'    : 'text/plain',
 377     '.latex'  : 'application/x-latex',
 378     '.m1v'    : 'video/mpeg',
 379     '.man'    : 'application/x-troff-man',
 380     '.me'     : 'application/x-troff-me',
 381     '.mht'    : 'message/rfc822',
 382     '.mhtml'  : 'message/rfc822',
 383     '.mif'    : 'application/x-mif',
 384     '.mov'    : 'video/quicktime',
 385     '.movie'  : 'video/x-sgi-movie',
 386     '.mp2'    : 'audio/mpeg',
 387     '.mp3'    : 'audio/mpeg',
 388     '.mpa'    : 'video/mpeg',
 389     '.mpe'    : 'video/mpeg',
 390     '.mpeg'   : 'video/mpeg',
 391     '.mpg'    : 'video/mpeg',
 392     '.ms'     : 'application/x-troff-ms',
 393     '.nc'     : 'application/x-netcdf',
 394     '.nws'    : 'message/rfc822',
 395     '.o'      : 'application/octet-stream',
 396     '.obj'    : 'application/octet-stream',
 397     '.oda'    : 'application/oda',
 398     '.p12'    : 'application/x-pkcs12',
 399     '.p7c'    : 'application/pkcs7-mime',
 400     '.pbm'    : 'image/x-portable-bitmap',
 401     '.pdf'    : 'application/pdf',
 402     '.pfx'    : 'application/x-pkcs12',
 403     '.pgm'    : 'image/x-portable-graymap',
 404     '.pl'     : 'text/plain',
 405     '.png'    : 'image/png',
 406     '.pnm'    : 'image/x-portable-anymap',
 407     '.pot'    : 'application/vnd.ms-powerpoint',
 408     '.ppa'    : 'application/vnd.ms-powerpoint',
 409     '.ppm'    : 'image/x-portable-pixmap',
 410     '.pps'    : 'application/vnd.ms-powerpoint',
 411     '.ppt'    : 'application/vnd.ms-powerpoint',
 412     '.ps'     : 'application/postscript',
 413     '.pwz'    : 'application/vnd.ms-powerpoint',
 414     '.py'     : 'text/x-python',
 415     '.pyc'    : 'application/x-python-code',
 416     '.pyo'    : 'application/x-python-code',
 417     '.qt'     : 'video/quicktime',
 418     '.ra'     : 'audio/x-pn-realaudio',
 419     '.ram'    : 'application/x-pn-realaudio',
 420     '.ras'    : 'image/x-cmu-raster',
 421     '.rdf'    : 'application/xml',
 422     '.rgb'    : 'image/x-rgb',
 423     '.roff'   : 'application/x-troff',
 424     '.rtx'    : 'text/richtext',
 425     '.sgm'    : 'text/x-sgml',
 426     '.sgml'   : 'text/x-sgml',
 427     '.sh'     : 'application/x-sh',
 428     '.shar'   : 'application/x-shar',
 429     '.snd'    : 'audio/basic',
 430     '.so'     : 'application/octet-stream',
 431     '.src'    : 'application/x-wais-source',
 432     '.sv4cpio': 'application/x-sv4cpio',
 433     '.sv4crc' : 'application/x-sv4crc',
 434     '.t'      : 'application/x-troff',
 435     '.tar'    : 'application/x-tar',
 436     '.tcl'    : 'application/x-tcl',
 437     '.tex'    : 'application/x-tex',
 438     '.texi'   : 'application/x-texinfo',
 439     '.texinfo': 'application/x-texinfo',
 440     '.tif'    : 'image/tiff',
 441     '.tiff'   : 'image/tiff',
 442     '.tr'     : 'application/x-troff',
 443     '.tsv'    : 'text/tab-separated-values',
 444     '.txt'    : 'text/plain',
 445     '.ustar'  : 'application/x-ustar',
 446     '.vcf'    : 'text/x-vcard',
 447     '.wav'    : 'audio/x-wav',
 448     '.wiz'    : 'application/msword',
 449     '.xbm'    : 'image/x-xbitmap',
 450     '.xlb'    : 'application/vnd.ms-excel',
 451     # Duplicates :(
 452     '.xls'    : 'application/excel',
 453     '.xls'    : 'application/vnd.ms-excel',
 454     '.xml'    : 'text/xml',
 455     '.xpm'    : 'image/x-xpixmap',
 456     '.xsl'    : 'application/xml',
 457     '.xwd'    : 'image/x-xwindowdump',
 458     '.zip'    : 'application/zip',
 459     }
 460
 461 # These are non-standard types, commonly found in the wild.  They will only
 462 # match if strict=0 flag is given to the API methods.
 463
 464 # Please sort these too
 465 common_types = {
 466     '.jpg' : 'image/jpg',
 467     '.mid' : 'audio/midi',
 468     '.midi': 'audio/midi',
 469     '.pct' : 'image/pict',
 470     '.pic' : 'image/pict',
 471     '.pict': 'image/pict',
 472     '.rtf' : 'application/rtf',
 473     '.xul' : 'text/xul'
 474     }
 475
 476
 477 if __name__ == '__main__':
 478     import sys
 479     import getopt
 480
 481     USAGE = """\
 482 Usage: mimetypes.py [options] type
 483
 484 Options:
 485     --help / -h       -- print this message and exit
 486     --lenient / -l    -- additionally search of some common, but non-standard
 487                          types.
 488     --extension / -e  -- guess extension instead of type
 489
 490 More than one type argument may be given.
 491 """
 492
 493     def usage(code, msg=''):
 494         print USAGE
 495         if msg: print msg
 496         sys.exit(code)
 497
 498     try:
 499         opts, args = getopt.getopt(sys.argv[1:], 'hle',
 500                                    ['help', 'lenient', 'extension'])
 501     except getopt.error, msg:
 502         usage(1, msg)
 503
 504     strict = 1
 505     extension = 0
 506     for opt, arg in opts:
 507         if opt in ('-h', '--help'):
 508             usage(0)
 509         elif opt in ('-l', '--lenient'):
 510             strict = 0
 511         elif opt in ('-e', '--extension'):
 512             extension = 1
 513     for gtype in args:
 514         if extension:
 515             guess = guess_extension(gtype, strict)
 516             if not guess: print "I don't know anything about type", gtype
 517             else: print guess
 518         else:
 519             guess, encoding = guess_type(gtype, strict)
 520             if not guess: print "I don't know anything about type", gtype
 521             else: print 'type:', guess, 'encoding:', encoding