Lib/mimetypes.py

   1 """Guess the MIME type of a file.
   2
   3 This module defines two useful functions:
   4
   5 guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
   6
   7 guess_extension(type, strict=1) -- guess the extension for a given MIME type.
   8
   9 It also contains the following, for tuning the behavior:
  10
  11 Data:
  12
  13 knownfiles -- list of files to parse
  14 inited -- flag set when init() has been called
  15 suffix_map -- dictionary mapping suffixes to suffixes
  16 encodings_map -- dictionary mapping suffixes to encodings
  17 types_map -- dictionary mapping suffixes to types
  18
  19 Functions:
  20
  21 init([files]) -- parse a list of files, default knownfiles
  22 read_mime_types(file) -- parse one file, return a dictionary or None
  23 """
  24
  25 import os
  26 import posixpath
  27 import urllib
  28
  29 __all__ = ["guess_type","guess_extension","read_mime_types","init"]
  30
  31 knownfiles = [
  32     "/usr/local/etc/httpd/conf/mime.types",
  33     "/usr/local/lib/netscape/mime.types",
  34     "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
  35     "/usr/local/etc/mime.types",                # Apache 1.3
  36     ]
  37
  38 inited = 0
  39
  40
  41 class MimeTypes:
  42     """MIME-types datastore.
  43
  44     This datastore can handle information from mime.types-style files
  45     and supports basic determination of MIME type from a filename or
  46     URL, and can guess a reasonable extension given a MIME type.
  47     """
  48
  49     def __init__(self, filenames=()):
  50         if not inited:
  51             init()
  52         self.encodings_map = encodings_map.copy()
  53         self.suffix_map = suffix_map.copy()
  54         self.types_map = types_map.copy()
  55         self.common_types = common_types.copy()
  56         for name in filenames:
  57             self.read(name)
  58
  59     def guess_type(self, url, strict=1):
  60         """Guess the type of a file based on its URL.
  61
  62         Return value is a tuple (type, encoding) where type is None if
  63         the type can't be guessed (no or unknown suffix) or a string
  64         of the form type/subtype, usable for a MIME Content-type
  65         header; and encoding is None for no encoding or the name of
  66         the program used to encode (e.g. compress or gzip).  The
  67         mappings are table driven.  Encoding suffixes are case
  68         sensitive; type suffixes are first tried case sensitive, then
  69         case insensitive.
  70
  71         The suffixes .tgz, .taz and .tz (case sensitive!) are all
  72         mapped to '.tar.gz'.  (This is table-driven too, using the
  73         dictionary suffix_map.)
  74
  75         Optional `strict' argument when false adds a bunch of commonly found,
  76         but non-standard types.
  77         """
  78         scheme, url = urllib.splittype(url)
  79         if scheme == 'data':
  80             # syntax of data URLs:
  81             # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
  82             # mediatype := [ type "/" subtype ] *( ";" parameter )
  83             # data      := *urlchar
  84             # parameter := attribute "=" value
  85             # type/subtype defaults to "text/plain"
  86             comma = url.find(',')
  87             if comma < 0:
  88                 # bad data URL
  89                 return None, None
  90             semi = url.find(';', 0, comma)
  91             if semi >= 0:
  92                 type = url[:semi]
  93             else:
  94                 type = url[:comma]
  95             if '=' in type or '/' not in type:
  96                 type = 'text/plain'
  97             return type, None           # never compressed, so encoding is None
  98         base, ext = posixpath.splitext(url)
  99         while ext in self.suffix_map:
 100             base, ext = posixpath.splitext(base + self.suffix_map[ext])
 101         if ext in self.encodings_map:
 102             encoding = self.encodings_map[ext]
 103             base, ext = posixpath.splitext(base)
 104         else:
 105             encoding = None
 106         types_map = self.types_map
 107         common_types = self.common_types
 108         if ext in types_map:
 109             return types_map[ext], encoding
 110         elif ext.lower() in types_map:
 111             return types_map[ext.lower()], encoding
 112         elif strict:
 113             return None, encoding
 114         elif ext in common_types:
 115             return common_types[ext], encoding
 116         elif ext.lower() in common_types:
 117             return common_types[ext.lower()], encoding
 118         else:
 119             return None, encoding
 120
 121     def guess_extension(self, type, strict=1):
 122         """Guess the extension for a file based on its MIME type.
 123
 124         Return value is a string giving a filename extension,
 125         including the leading dot ('.').  The extension is not
 126         guaranteed to have been associated with any particular data
 127         stream, but would be mapped to the MIME type `type' by
 128         guess_type().  If no extension can be guessed for `type', None
 129         is returned.
 130
 131         Optional `strict' argument when false adds a bunch of commonly found,
 132         but non-standard types.
 133         """
 134         type = type.lower()
 135         for ext, stype in self.types_map.items():
 136             if type == stype:
 137                 return ext
 138         if not strict:
 139             for ext, stype in common_types.items():
 140                 if type == stype:
 141                     return ext
 142         return None
 143
 144     def read(self, filename):
 145         """Read a single mime.types-format file, specified by pathname."""
 146         fp = open(filename)
 147         self.readfp(fp)
 148         fp.close()
 149
 150     def readfp(self, fp):
 151         """Read a single mime.types-format file."""
 152         map = self.types_map
 153         while 1:
 154             line = fp.readline()
 155             if not line:
 156                 break
 157             words = line.split()
 158             for i in range(len(words)):
 159                 if words[i][0] == '#':
 160                     del words[i:]
 161                     break
 162             if not words:
 163                 continue
 164             type, suffixes = words[0], words[1:]
 165             for suff in suffixes:
 166                 map['.' + suff] = type
 167
 168
def guess_type(url, strict=1):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    # init() rebinds the module-level name `guess_type' to the
    # guess_type method of the MimeTypes instance it builds, so the
    # call below resolves to that bound method and does not recurse
    # into this function.
    # NOTE(review): init() runs unconditionally here, so a caller that
    # kept a reference to this function object (for example through
    # "from mimetypes import guess_type") re-runs init() on every call.
    init()
    return guess_type(url, strict)
 189
 190
def guess_extension(type, strict=1):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    # init() rebinds the module-level name `guess_extension' to the
    # guess_extension method of the MimeTypes instance it builds, so
    # the call below resolves to that bound method and does not
    # recurse into this function.
    # NOTE(review): init() runs unconditionally here, so a caller that
    # kept a reference to this function object re-runs init() on every
    # call.
    init()
    return guess_extension(type, strict)
 205
 206
 207 def init(files=None):
 208     global guess_extension, guess_type
 209     global suffix_map, types_map, encodings_map, common_types
 210     global inited
 211     inited = 1
 212     db = MimeTypes()
 213     if files is None:
 214         files = knownfiles
 215     for file in files:
 216         if os.path.isfile(file):
 217             db.readfp(open(file))
 218     encodings_map = db.encodings_map
 219     suffix_map = db.suffix_map
 220     types_map = db.types_map
 221     guess_extension = db.guess_extension
 222     guess_type = db.guess_type
 223     common_types = db.common_types
 224
 225
 226 def read_mime_types(file):
 227     try:
 228         f = open(file)
 229     except IOError:
 230         return None
 231     db = MimeTypes()
 232     db.readfp(f)
 233     return db.types_map
 234
 235
 236 suffix_map = {
 237     '.tgz': '.tar.gz',
 238     '.taz': '.tar.gz',
 239     '.tz': '.tar.gz',
 240     }
 241
 242 encodings_map = {
 243     '.gz': 'gzip',
 244     '.Z': 'compress',
 245     }
 246
 247 # Before adding new types, make sure they are either registered with IANA, at
 248 # http://www.isi.edu/in-notes/iana/assignments/media-types
 249 # or extensions, i.e. using the x- prefix
 250
 251 # If you add to these, please keep them sorted!
 252 types_map = {
 253     '.a'      : 'application/octet-stream',
 254     '.ai'     : 'application/postscript',
 255     '.aif'    : 'audio/x-aiff',
 256     '.aifc'   : 'audio/x-aiff',
 257     '.aiff'   : 'audio/x-aiff',
 258     '.au'     : 'audio/basic',
 259     '.avi'    : 'video/x-msvideo',
 260     '.bat'    : 'text/plain',
 261     '.bcpio'  : 'application/x-bcpio',
 262     '.bin'    : 'application/octet-stream',
 263     '.bmp'    : 'image/x-ms-bmp',
 264     '.c'      : 'text/plain',
 265     # Duplicates :(
 266     '.cdf'    : 'application/x-cdf',
 267     '.cdf'    : 'application/x-netcdf',
 268     '.cpio'   : 'application/x-cpio',
 269     '.csh'    : 'application/x-csh',
 270     '.css'    : 'text/css',
 271     '.dll'    : 'application/octet-stream',
 272     '.doc'    : 'application/msword',
 273     '.dot'    : 'application/msword',
 274     '.dvi'    : 'application/x-dvi',
 275     '.eml'    : 'message/rfc822',
 276     '.eps'    : 'application/postscript',
 277     '.etx'    : 'text/x-setext',
 278     '.exe'    : 'application/octet-stream',
 279     '.gif'    : 'image/gif',
 280     '.gtar'   : 'application/x-gtar',
 281     '.h'      : 'text/plain',
 282     '.hdf'    : 'application/x-hdf',
 283     '.htm'    : 'text/html',
 284     '.html'   : 'text/html',
 285     '.ief'    : 'image/ief',
 286     '.jpe'    : 'image/jpeg',
 287     '.jpeg'   : 'image/jpeg',
 288     '.jpg'    : 'image/jpeg',
 289     '.js'     : 'application/x-javascript',
 290     '.ksh'    : 'text/plain',
 291     '.latex'  : 'application/x-latex',
 292     '.m1v'    : 'video/mpeg',
 293     '.man'    : 'application/x-troff-man',
 294     '.me'     : 'application/x-troff-me',
 295     '.mht'    : 'message/rfc822',
 296     '.mhtml'  : 'message/rfc822',
 297     '.mif'    : 'application/x-mif',
 298     '.mov'    : 'video/quicktime',
 299     '.movie'  : 'video/x-sgi-movie',
 300     '.mp2'    : 'audio/mpeg',
 301     '.mp3'    : 'audio/mpeg',
 302     '.mpa'    : 'video/mpeg',
 303     '.mpe'    : 'video/mpeg',
 304     '.mpeg'   : 'video/mpeg',
 305     '.mpg'    : 'video/mpeg',
 306     '.ms'     : 'application/x-troff-ms',
 307     '.nc'     : 'application/x-netcdf',
 308     '.nws'    : 'message/rfc822',
 309     '.o'      : 'application/octet-stream',
 310     '.obj'    : 'application/octet-stream',
 311     '.oda'    : 'application/oda',
 312     '.p12'    : 'application/x-pkcs12',
 313     '.p7c'    : 'application/pkcs7-mime',
 314     '.pbm'    : 'image/x-portable-bitmap',
 315     '.pdf'    : 'application/pdf',
 316     '.pfx'    : 'application/x-pkcs12',
 317     '.pgm'    : 'image/x-portable-graymap',
 318     '.pl'     : 'text/plain',
 319     '.png'    : 'image/png',
 320     '.pnm'    : 'image/x-portable-anymap',
 321     '.pot'    : 'application/vnd.ms-powerpoint',
 322     '.ppa'    : 'application/vnd.ms-powerpoint',
 323     '.ppm'    : 'image/x-portable-pixmap',
 324     '.pps'    : 'application/vnd.ms-powerpoint',
 325     '.ppt'    : 'application/vnd.ms-powerpoint',
 326     '.ps'     : 'application/postscript',
 327     '.pwz'    : 'application/vnd.ms-powerpoint',
 328     '.py'     : 'text/x-python',
 329     '.pyc'    : 'application/x-python-code',
 330     '.pyo'    : 'application/x-python-code',
 331     '.qt'     : 'video/quicktime',
 332     '.ra'     : 'audio/x-pn-realaudio',
 333     '.ram'    : 'application/x-pn-realaudio',
 334     '.ras'    : 'image/x-cmu-raster',
 335     '.rdf'    : 'application/xml',
 336     '.rgb'    : 'image/x-rgb',
 337     '.roff'   : 'application/x-troff',
 338     '.rtx'    : 'text/richtext',
 339     '.sgm'    : 'text/x-sgml',
 340     '.sgml'   : 'text/x-sgml',
 341     '.sh'     : 'application/x-sh',
 342     '.shar'   : 'application/x-shar',
 343     '.snd'    : 'audio/basic',
 344     '.so'     : 'application/octet-stream',
 345     '.src'    : 'application/x-wais-source',
 346     '.sv4cpio': 'application/x-sv4cpio',
 347     '.sv4crc' : 'application/x-sv4crc',
 348     '.t'      : 'application/x-troff',
 349     '.tar'    : 'application/x-tar',
 350     '.tcl'    : 'application/x-tcl',
 351     '.tex'    : 'application/x-tex',
 352     '.texi'   : 'application/x-texinfo',
 353     '.texinfo': 'application/x-texinfo',
 354     '.tif'    : 'image/tiff',
 355     '.tiff'   : 'image/tiff',
 356     '.tr'     : 'application/x-troff',
 357     '.tsv'    : 'text/tab-separated-values',
 358     '.txt'    : 'text/plain',
 359     '.ustar'  : 'application/x-ustar',
 360     '.vcf'    : 'text/x-vcard',
 361     '.wav'    : 'audio/x-wav',
 362     '.wiz'    : 'application/msword',
 363     '.xbm'    : 'image/x-xbitmap',
 364     '.xlb'    : 'application/vnd.ms-excel',
 365     # Duplicates :(
 366     '.xls'    : 'application/excel',
 367     '.xls'    : 'application/vnd.ms-excel',
 368     '.xml'    : 'text/xml',
 369     '.xpm'    : 'image/x-xpixmap',
 370     '.xsl'    : 'application/xml',
 371     '.xwd'    : 'image/x-xwindowdump',
 372     '.zip'    : 'application/zip',
 373     }
 374
 375 # These are non-standard types, commonly found in the wild.  They will only
 376 # match if strict=0 flag is given to the API methods.
 377
 378 # Please sort these too
 379 common_types = {
 380     '.jpg' : 'image/jpg',
 381     '.mid' : 'audio/midi',
 382     '.midi': 'audio/midi',
 383     '.pct' : 'image/pict',
 384     '.pic' : 'image/pict',
 385     '.pict': 'image/pict',
 386     '.rtf' : 'application/rtf',
 387     '.xul' : 'text/xul'
 388     }
 389
 390
 391 if __name__ == '__main__':
 392     import sys
 393     import getopt
 394
 395     USAGE = """\
 396 Usage: mimetypes.py [options] type
 397
 398 Options:
 399     --help / -h       -- print this message and exit
 400     --lenient / -l    -- additionally search of some common, but non-standard
 401                          types.
 402     --extension / -e  -- guess extension instead of type
 403
 404 More than one type argument may be given.
 405 """
 406
 407     def usage(code, msg=''):
 408         print USAGE
 409         if msg: print msg
 410         sys.exit(code)
 411
 412     try:
 413         opts, args = getopt.getopt(sys.argv[1:], 'hle',
 414                                    ['help', 'lenient', 'extension'])
 415     except getopt.error, msg:
 416         usage(1, msg)
 417
 418     strict = 1
 419     extension = 0
 420     for opt, arg in opts:
 421         if opt in ('-h', '--help'):
 422             usage(0)
 423         elif opt in ('-l', '--lenient'):
 424             strict = 0
 425         elif opt in ('-e', '--extension'):
 426             extension = 1
 427     for gtype in args:
 428         if extension:
 429             guess = guess_extension(gtype, strict)
 430             if not guess: print "I don't know anything about type", gtype
 431             else: print guess
 432         else:
 433             guess, encoding = guess_type(gtype, strict)
 434             if not guess: print "I don't know anything about type", gtype
 435             else: print 'type:', guess, 'encoding:', encoding