Python/codecs.c

   1 /* ------------------------------------------------------------------------
   2
   3    Python Codec Registry and support functions
   4
   5 Written by Marc-Andre Lemburg (mal@lemburg.com).
   6
   7 Copyright (c) Corporation for National Research Initiatives.
   8
   9    ------------------------------------------------------------------------ */
  10
  11 #include "Python.h"
  12 #include <ctype.h>
  13 #ifdef HAVE_LIMITS_H
  14 #include <limits.h>
  15 #endif
  16
  17 /* --- Globals ------------------------------------------------------------ */
  18
  19 static PyObject *_PyCodec_SearchPath;
  20 static PyObject *_PyCodec_SearchCache;
  21
  22 /* Flag used for lazy import of the standard encodings package */
  23 static int import_encodings_called = 0;
  24
  25 /* --- Codec Registry ----------------------------------------------------- */
  26
  27 /* Import the standard encodings package which will register the first
  28    codec search function.
  29
  30    This is done in a lazy way so that the Unicode implementation does
  31    not downgrade startup time of scripts not needing it.
  32
  33    ImportErrors are silently ignored by this function. Only one try is
  34    made.
  35
  36 */
  37
  38 static
  39 int import_encodings(void)
  40 {
  41     PyObject *mod;
  42
  43     import_encodings_called = 1;
  44     mod = PyImport_ImportModule("encodings");
  45     if (mod == NULL) {
  46         if (PyErr_ExceptionMatches(PyExc_ImportError)) {
  47             /* Ignore ImportErrors... this is done so that
  48                distributions can disable the encodings package. Note
  49                that other errors are not masked, e.g. SystemErrors
  50                raised to inform the user of an error in the Python
  51                configuration are still reported back to the user. */
  52             PyErr_Clear();
  53             return 0;
  54         }
  55         return -1;
  56     }
  57     Py_DECREF(mod);
  58     return 0;
  59 }
  60
  61 int PyCodec_Register(PyObject *search_function)
  62 {
  63     if (!import_encodings_called) {
  64         if (import_encodings())
  65             goto onError;
  66     }
  67     if (search_function == NULL) {
  68         PyErr_BadArgument();
  69         goto onError;
  70     }
  71     if (!PyCallable_Check(search_function)) {
  72         PyErr_SetString(PyExc_TypeError,
  73                         "argument must be callable");
  74         goto onError;
  75     }
  76     return PyList_Append(_PyCodec_SearchPath, search_function);
  77
  78  onError:
  79     return -1;
  80 }
  81
  82 /* Convert a string to a normalized Python string: all characters are
  83    converted to lower case, spaces are replaced with underscores. */
  84
  85 static
  86 PyObject *normalizestring(const char *string)
  87 {
  88     register size_t i;
  89     size_t len = strlen(string);
  90     char *p;
  91     PyObject *v;
  92
  93         if (len > INT_MAX) {
  94                 PyErr_SetString(PyExc_OverflowError, "string is too large");
  95                 return NULL;
  96         }
  97
  98     v = PyString_FromStringAndSize(NULL, (int)len);
  99     if (v == NULL)
 100         return NULL;
 101     p = PyString_AS_STRING(v);
 102     for (i = 0; i < len; i++) {
 103         register char ch = string[i];
 104         if (ch == ' ')
 105             ch = '-';
 106         else
 107             ch = tolower(ch);
 108         p[i] = ch;
 109     }
 110     return v;
 111 }
 112
 113 /* Lookup the given encoding and return a tuple providing the codec
 114    facilities.
 115
 116    The encoding string is looked up converted to all lower-case
 117    characters. This makes encodings looked up through this mechanism
 118    effectively case-insensitive.
 119
 120    If no codec is found, a LookupError is set and NULL returned.
 121
 122    As side effect, this tries to load the encodings package, if not
 123    yet done. This is part of the lazy load strategy for the encodings
 124    package.
 125
 126 */
 127
 128 PyObject *_PyCodec_Lookup(const char *encoding)
 129 {
 130     PyObject *result, *args = NULL, *v;
 131     int i, len;
 132
 133     if (encoding == NULL) {
 134         PyErr_BadArgument();
 135         goto onError;
 136     }
 137     if (_PyCodec_SearchCache == NULL ||
 138         _PyCodec_SearchPath == NULL) {
 139         PyErr_SetString(PyExc_SystemError,
 140                         "codec module not properly initialized");
 141         goto onError;
 142     }
 143     if (!import_encodings_called) {
 144         if (import_encodings())
 145             goto onError;
 146     }
 147
 148     /* Convert the encoding to a normalized Python string: all
 149        characters are converted to lower case, spaces and hyphens are
 150        replaced with underscores. */
 151     v = normalizestring(encoding);
 152     if (v == NULL)
 153         goto onError;
 154     PyString_InternInPlace(&v);
 155
 156     /* First, try to lookup the name in the registry dictionary */
 157     result = PyDict_GetItem(_PyCodec_SearchCache, v);
 158     if (result != NULL) {
 159         Py_INCREF(result);
 160         Py_DECREF(v);
 161         return result;
 162     }
 163
 164     /* Next, scan the search functions in order of registration */
 165     args = PyTuple_New(1);
 166     if (args == NULL)
 167         goto onError;
 168     PyTuple_SET_ITEM(args,0,v);
 169
 170     len = PyList_Size(_PyCodec_SearchPath);
 171     if (len < 0)
 172         goto onError;
 173     if (len == 0) {
 174         PyErr_SetString(PyExc_LookupError,
 175                         "no codec search functions registered: "
 176                         "can't find encoding");
 177         goto onError;
 178     }
 179
 180     for (i = 0; i < len; i++) {
 181         PyObject *func;
 182
 183         func = PyList_GetItem(_PyCodec_SearchPath, i);
 184         if (func == NULL)
 185             goto onError;
 186         result = PyEval_CallObject(func, args);
 187         if (result == NULL)
 188             goto onError;
 189         if (result == Py_None) {
 190             Py_DECREF(result);
 191             continue;
 192         }
 193         if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
 194             PyErr_SetString(PyExc_TypeError,
 195                             "codec search functions must return 4-tuples");
 196             Py_DECREF(result);
 197             goto onError;
 198         }
 199         break;
 200     }
 201     if (i == len) {
 202         /* XXX Perhaps we should cache misses too ? */
 203         PyErr_SetString(PyExc_LookupError,
 204                         "unknown encoding");
 205         goto onError;
 206     }
 207
 208     /* Cache and return the result */
 209     PyDict_SetItem(_PyCodec_SearchCache, v, result);
 210     Py_DECREF(args);
 211     return result;
 212
 213  onError:
 214     Py_XDECREF(args);
 215     return NULL;
 216 }
 217
 218 static
 219 PyObject *args_tuple(PyObject *object,
 220                      const char *errors)
 221 {
 222     PyObject *args;
 223
 224     args = PyTuple_New(1 + (errors != NULL));
 225     if (args == NULL)
 226         return NULL;
 227     Py_INCREF(object);
 228     PyTuple_SET_ITEM(args,0,object);
 229     if (errors) {
 230         PyObject *v;
 231
 232         v = PyString_FromString(errors);
 233         if (v == NULL) {
 234             Py_DECREF(args);
 235             return NULL;
 236         }
 237         PyTuple_SET_ITEM(args, 1, v);
 238     }
 239     return args;
 240 }
 241
 242 /* Build a codec by calling factory(stream[,errors]) or just
 243    factory(errors) depending on whether the given parameters are
 244    non-NULL. */
 245
 246 static
 247 PyObject *build_stream_codec(PyObject *factory,
 248                              PyObject *stream,
 249                              const char *errors)
 250 {
 251     PyObject *args, *codec;
 252
 253     args = args_tuple(stream, errors);
 254     if (args == NULL)
 255         return NULL;
 256
 257     codec = PyEval_CallObject(factory, args);
 258     Py_DECREF(args);
 259     return codec;
 260 }
 261
 262 /* Convenience APIs to query the Codec registry.
 263
 264    All APIs return a codec object with incremented refcount.
 265
 266  */
 267
 268 PyObject *PyCodec_Encoder(const char *encoding)
 269 {
 270     PyObject *codecs;
 271     PyObject *v;
 272
 273     codecs = _PyCodec_Lookup(encoding);
 274     if (codecs == NULL)
 275         goto onError;
 276     v = PyTuple_GET_ITEM(codecs,0);
 277     Py_INCREF(v);
 278     return v;
 279
 280  onError:
 281     return NULL;
 282 }
 283
 284 PyObject *PyCodec_Decoder(const char *encoding)
 285 {
 286     PyObject *codecs;
 287     PyObject *v;
 288
 289     codecs = _PyCodec_Lookup(encoding);
 290     if (codecs == NULL)
 291         goto onError;
 292     v = PyTuple_GET_ITEM(codecs,1);
 293     Py_INCREF(v);
 294     return v;
 295
 296  onError:
 297     return NULL;
 298 }
 299
 300 PyObject *PyCodec_StreamReader(const char *encoding,
 301                                PyObject *stream,
 302                                const char *errors)
 303 {
 304     PyObject *codecs;
 305
 306     codecs = _PyCodec_Lookup(encoding);
 307     if (codecs == NULL)
 308         goto onError;
 309     return build_stream_codec(PyTuple_GET_ITEM(codecs,2),stream,errors);
 310
 311  onError:
 312     return NULL;
 313 }
 314
 315 PyObject *PyCodec_StreamWriter(const char *encoding,
 316                                PyObject *stream,
 317                                const char *errors)
 318 {
 319     PyObject *codecs;
 320
 321     codecs = _PyCodec_Lookup(encoding);
 322     if (codecs == NULL)
 323         goto onError;
 324     return build_stream_codec(PyTuple_GET_ITEM(codecs,3),stream,errors);
 325
 326  onError:
 327     return NULL;
 328 }
 329
 330 /* Encode an object (e.g. an Unicode object) using the given encoding
 331    and return the resulting encoded object (usually a Python string).
 332
 333    errors is passed to the encoder factory as argument if non-NULL. */
 334
 335 PyObject *PyCodec_Encode(PyObject *object,
 336                          const char *encoding,
 337                          const char *errors)
 338 {
 339     PyObject *encoder = NULL;
 340     PyObject *args = NULL, *result;
 341     PyObject *v;
 342
 343     encoder = PyCodec_Encoder(encoding);
 344     if (encoder == NULL)
 345         goto onError;
 346
 347     args = args_tuple(object, errors);
 348     if (args == NULL)
 349         goto onError;
 350
 351     result = PyEval_CallObject(encoder,args);
 352     if (result == NULL)
 353         goto onError;
 354
 355     if (!PyTuple_Check(result) ||
 356         PyTuple_GET_SIZE(result) != 2) {
 357         PyErr_SetString(PyExc_TypeError,
 358                         "encoder must return a tuple (object,integer)");
 359         goto onError;
 360     }
 361     v = PyTuple_GET_ITEM(result,0);
 362     Py_INCREF(v);
 363     /* We don't check or use the second (integer) entry. */
 364
 365     Py_DECREF(args);
 366     Py_DECREF(encoder);
 367     Py_DECREF(result);
 368     return v;
 369
 370  onError:
 371     Py_XDECREF(args);
 372     Py_XDECREF(encoder);
 373     return NULL;
 374 }
 375
 376 /* Decode an object (usually a Python string) using the given encoding
 377    and return an equivalent object (e.g. an Unicode object).
 378
 379    errors is passed to the decoder factory as argument if non-NULL. */
 380
 381 PyObject *PyCodec_Decode(PyObject *object,
 382                          const char *encoding,
 383                          const char *errors)
 384 {
 385     PyObject *decoder = NULL;
 386     PyObject *args = NULL, *result = NULL;
 387     PyObject *v;
 388
 389     decoder = PyCodec_Decoder(encoding);
 390     if (decoder == NULL)
 391         goto onError;
 392
 393     args = args_tuple(object, errors);
 394     if (args == NULL)
 395         goto onError;
 396
 397     result = PyEval_CallObject(decoder,args);
 398     if (result == NULL)
 399         goto onError;
 400     if (!PyTuple_Check(result) ||
 401         PyTuple_GET_SIZE(result) != 2) {
 402         PyErr_SetString(PyExc_TypeError,
 403                         "decoder must return a tuple (object,integer)");
 404         goto onError;
 405     }
 406     v = PyTuple_GET_ITEM(result,0);
 407     Py_INCREF(v);
 408     /* We don't check or use the second (integer) entry. */
 409
 410     Py_DECREF(args);
 411     Py_DECREF(decoder);
 412     Py_DECREF(result);
 413     return v;
 414
 415  onError:
 416     Py_XDECREF(args);
 417     Py_XDECREF(decoder);
 418     Py_XDECREF(result);
 419     return NULL;
 420 }
 421
 422 void _PyCodecRegistry_Init(void)
 423 {
 424     if (_PyCodec_SearchPath == NULL)
 425         _PyCodec_SearchPath = PyList_New(0);
 426     if (_PyCodec_SearchCache == NULL)
 427         _PyCodec_SearchCache = PyDict_New();
 428     if (_PyCodec_SearchPath == NULL ||
 429         _PyCodec_SearchCache == NULL)
 430         Py_FatalError("can't initialize codec registry");
 431 }
 432
 433 void _PyCodecRegistry_Fini(void)
 434 {
 435     Py_XDECREF(_PyCodec_SearchPath);
 436     _PyCodec_SearchPath = NULL;
 437     Py_XDECREF(_PyCodec_SearchCache);
 438     _PyCodec_SearchCache = NULL;
 439 }