lib/strutil/tokenize.c

   1 /*
   2    Parse string into tokens.
   3
   4    Copyright (C) 2024
   5    Free Software Foundation, Inc.
   6
   7    Written by:
   8    Andrew Borodin <aborodin@vmail.ru> 2010-2024
   9
  10    The str_tokenize() and str_tokenize_word routines are mostly from
  11    GNU readline-8.2.
  12
  13    This file is part of the Midnight Commander.
  14
  15    The Midnight Commander is free software: you can redistribute it
  16    and/or modify it under the terms of the GNU General Public License as
  17    published by the Free Software Foundation, either version 3 of the License,
  18    or (at your option) any later version.
  19
  20    The Midnight Commander is distributed in the hope that it will be useful,
  21    but WITHOUT ANY WARRANTY; without even the implied warranty of
  22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  23    GNU General Public License for more details.
  24
  25    You should have received a copy of the GNU General Public License
  26    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  27  */
  28
  29 /** \file tokenize.c
  30  *  \brief Source: parse string into tokens.
  31  */
  32
  33 #include <config.h>
  34
  35 #include <stdlib.h>
  36 #include <string.h>
  37
  38 #include "lib/global.h"
  39 #include "lib/util.h"           /* whiteness() */
  40
  41 #include "lib/strutil.h"
  42
  43 /*** global variables ****************************************************************************/
  44
  45 /*** file scope macro definitions ****************************************************************/
  46
  47 #define WORD_DELIMITERS " \t\n;&()|<>"
  48 #define QUOTE_CHARACTERS "\"'`"
  49
  50 #define slashify_in_quotes "\\`\"$"
  51
  52 #define member(c, s) ((c != '\0') ? (strchr ((s), (c)) != NULL) : FALSE)
  53
  54 /*** file scope type declarations ****************************************************************/
  55
  56 /*** forward declarations (file scope functions) *************************************************/
  57
  58 /*** file scope variables ************************************************************************/
  59
  60 /* --------------------------------------------------------------------------------------------- */
  61 /*** file scope functions ************************************************************************/
  62 /* --------------------------------------------------------------------------------------------- */
  63
  64 /*
  65  * Based on history_tokenize_word() from GNU readline-8.2
  66  */
  67 static int
  68 str_tokenize_word (const char *string, int start)
  69 {
  70     int i = start;
  71     char delimiter = '\0';
  72     char delimopen = '\0';
  73     int nestdelim = 0;
  74
  75     if (member (string[i], "()\n"))     /* XXX - included \n, but why? been here forever */
  76         return (i + 1);
  77
  78     if (g_ascii_isdigit (string[i]))
  79     {
  80         int j;
  81
  82         for (j = i; string[j] != '\0' && g_ascii_isdigit (string[j]); j++)
  83             ;
  84
  85         if (string[j] == '\0')
  86             return j;
  87
  88         if (string[j] == '<' || string[j] == '>')
  89             i = j;              /* digit sequence is a file descriptor */
  90         else
  91         {
  92             i = j;              /* digit sequence is part of a word */
  93             goto get_word;
  94         }
  95     }
  96
  97     if (member (string[i], "<>;&|"))
  98     {
  99         char peek = string[i + 1];
 100
 101         if (peek == string[i])
 102         {
 103             if (peek == '<' && (string[i + 2] == '-' || string[i + 2] == '<'))
 104                 i++;
 105             return (i + 2);
 106         }
 107
 108         if (peek == '&' && (string[i] == '>' || string[i] == '<'))
 109         {
 110             int j;
 111
 112             /* file descriptor */
 113             for (j = i + 2; string[j] != '\0' && g_ascii_isdigit (string[j]); j++)
 114                 ;
 115             if (string[j] == '-')       /* <&[digits]-, >&[digits]- */
 116                 j++;
 117             return j;
 118         }
 119
 120         if ((peek == '>' && string[i] == '&') || (peek == '|' && string[i] == '>'))
 121             return (i + 2);
 122
 123         /* XXX - process substitution -- separated out for later -- bash-4.2 */
 124         if (peek == '(' && (string[i] == '>' || string[i] == '<'))
 125         {
 126             /* ) */
 127             i += 2;
 128             delimopen = '(';
 129             delimiter = ')';
 130             nestdelim = 1;
 131             goto get_word;
 132         }
 133
 134         return (i + 1);
 135     }
 136
 137   get_word:
 138     /* Get word from string + i; */
 139
 140     if (delimiter == '\0' && member (string[i], QUOTE_CHARACTERS))
 141     {
 142         delimiter = string[i];
 143         i++;
 144     }
 145
 146     for (; string[i] != '\0'; i++)
 147     {
 148         if (string[i] == '\\' && string[i + 1] == '\n')
 149         {
 150             i++;
 151             continue;
 152         }
 153
 154         if (string[i] == '\\' && delimiter != '\'' &&
 155             (delimiter != '"' || member (string[i], slashify_in_quotes)))
 156         {
 157             i++;
 158             continue;
 159         }
 160
 161         /* delimiter must be set and set to something other than a quote if
 162            nestdelim is set, so these tests are safe. */
 163         if (nestdelim != 0 && string[i] == delimopen)
 164         {
 165             nestdelim++;
 166             continue;
 167         }
 168         if (nestdelim != 0 && string[i] == delimiter)
 169         {
 170             nestdelim--;
 171             if (nestdelim == 0)
 172                 delimiter = '\0';
 173             continue;
 174         }
 175
 176         if (delimiter != '\0' && string[i] == delimiter)
 177         {
 178             delimiter = '\0';
 179             continue;
 180         }
 181
 182         /* Command and process substitution; shell extended globbing patterns */
 183         if (nestdelim == 0 && delimiter == '\0' && member (string[i], "<>$!@?+*")
 184             && string[i + 1] == '(')
 185         {
 186             /* ) */
 187             i += 2;
 188             delimopen = '(';
 189             delimiter = ')';
 190             nestdelim = 1;
 191             continue;
 192         }
 193
 194         if (delimiter == '\0' && member (string[i], WORD_DELIMITERS))
 195             break;
 196
 197         if (delimiter == '\0' && member (string[i], QUOTE_CHARACTERS))
 198             delimiter = string[i];
 199     }
 200
 201     return i;
 202 }
 203
 204 /* --------------------------------------------------------------------------------------------- */
 205 /*** public functions ****************************************************************************/
 206 /* --------------------------------------------------------------------------------------------- */
 207
 208 /* Parse string into tokens.
 209  *
 210  * Based on history_tokenize_internal() from GNU readline-8.2
 211  */
 212 GPtrArray *
 213 str_tokenize (const char *string)
 214 {
 215     GPtrArray *result = NULL;
 216     int i = 0;
 217
 218     /* Get a token, and stuff it into RESULT.  The tokens are split
 219        exactly where the shell would split them. */
 220     while (string[i] != '\0')
 221     {
 222         int start;
 223
 224         /* Skip leading whitespace */
 225         for (; string[i] != '\0' && whiteness (string[i]); i++)
 226             ;
 227
 228         if (string[i] == '\0')
 229             return result;
 230
 231         start = i;
 232         i = str_tokenize_word (string, start);
 233
 234         /* If we have a non-whitespace delimiter character (which would not be
 235            skipped by the loop above), use it and any adjacent delimiters to
 236            make a separate field.  Any adjacent white space will be skipped the
 237            next time through the loop. */
 238         if (i == start)
 239             for (i++; string[i] != '\0' && member (string[i], WORD_DELIMITERS); i++)
 240                 ;
 241
 242         if (result == NULL)
 243             result = g_ptr_array_new ();
 244
 245         g_ptr_array_add (result, g_strndup (string + start, i - start));
 246     }
 247
 248     return result;
 249 }
 250
 251 /* --------------------------------------------------------------------------------------------- */