tex.c

   1 /*
   2  *       $Id: tex.c 666 2008-05-15 17:47:31Z dfishburn $
   3  *
   4  *       Copyright (c) 2008, David Fishburn
   5  *
   6  *       This source code is released for free distribution under the terms of the
   7  *       GNU General Public License.
   8  *
   9  *       This module contains functions for generating tags for TeX language files.
  10  *
  11  *       Tex language reference:
  12  *               http://en.wikibooks.org/wiki/TeX#The_Structure_of_TeX
  13  */
  14
  15 /*
  16  *       INCLUDE FILES
  17  */
  18 #include "general.h"    /* must always come first */
  19 #include <ctype.h>      /* to define isalpha () */
  20 #include <setjmp.h>
  21 #ifdef DEBUG
  22 #include <stdio.h>
  23 #endif
  24
  25 #include "debug.h"
  26 #include "entry.h"
  27 #include "keyword.h"
  28 #include "parse.h"
  29 #include "read.h"
  30 #include "routines.h"
  31 #include "vstring.h"
  32
  33 /*
  34  *       MACROS
  35  */
  36 #define isType(token,t)         (boolean) ((token)->type == (t))
  37 #define isKeyword(token,k)      (boolean) ((token)->keyword == (k))
  38
  39 /*
  40  *       DATA DECLARATIONS
  41  */
  42
  43 typedef enum eException { ExceptionNone, ExceptionEOF } exception_t;
  44
  45 /*
  46  * Used to specify type of keyword.
  47  */
  48 typedef enum eKeywordId {
  49         KEYWORD_NONE = -1,
  50         KEYWORD_chapter,
  51         KEYWORD_section,
  52         KEYWORD_subsection,
  53         KEYWORD_subsubsection,
  54         KEYWORD_part,
  55         KEYWORD_paragraph,
  56         KEYWORD_subparagraph,
  57         KEYWORD_include
  58 } keywordId;
  59
  60 /*      Used to determine whether keyword is valid for the token language and
  61  *      what its ID is.
  62  */
  63 typedef struct sKeywordDesc {
  64         const char *name;
  65         keywordId id;
  66 } keywordDesc;
  67
  68 typedef enum eTokenType {
  69         TOKEN_UNDEFINED,
  70         TOKEN_CHARACTER,
  71         TOKEN_CLOSE_PAREN,
  72         TOKEN_COMMA,
  73         TOKEN_KEYWORD,
  74         TOKEN_OPEN_PAREN,
  75         TOKEN_IDENTIFIER,
  76         TOKEN_STRING,
  77         TOKEN_OPEN_CURLY,
  78         TOKEN_CLOSE_CURLY,
  79         TOKEN_OPEN_SQUARE,
  80         TOKEN_CLOSE_SQUARE,
  81         TOKEN_QUESTION_MARK,
  82         TOKEN_STAR
  83 } tokenType;
  84
  85 typedef struct sTokenInfo {
  86         tokenType               type;
  87         keywordId               keyword;
  88         vString *               string;
  89         vString *               scope;
  90         unsigned long   lineNumber;
  91         fpos_t                  filePosition;
  92 } tokenInfo;
  93
  94 /*
  95  *      DATA DEFINITIONS
  96  */
  97
  98 static langType Lang_js;
  99
 100 static jmp_buf Exception;
 101
 102 typedef enum {
 103         TEXTAG_CHAPTER,
 104         TEXTAG_SECTION,
 105         TEXTAG_SUBSECTION,
 106         TEXTAG_SUBSUBSECTION,
 107         TEXTAG_PART,
 108         TEXTAG_PARAGRAPH,
 109         TEXTAG_SUBPARAGRAPH,
 110         TEXTAG_INCLUDE,
 111         TEXTAG_COUNT
 112 } texKind;
 113
 114 static kindOption TexKinds [] = {
 115         { TRUE,  'c', "chapter",                  "chapters"               },
 116         { TRUE,  's', "section",                  "sections"               },
 117         { TRUE,  'u', "subsection",               "subsections"            },
 118         { TRUE,  'b', "subsubsection",    "subsubsections"         },
 119         { TRUE,  'p', "part",                     "parts"                          },
 120         { TRUE,  'P', "paragraph",                "paragraphs"             },
 121         { TRUE,  'G', "subparagraph",     "subparagraphs"          },
 122         { TRUE,  'i', "include",                  "includes"               }
 123 };
 124
 125 static const keywordDesc TexKeywordTable [] = {
 126         /* keyword                      keyword ID */
 127         { "chapter",            KEYWORD_chapter                         },
 128         { "section",            KEYWORD_section                         },
 129         { "subsection",         KEYWORD_subsection                      },
 130         { "subsubsection",      KEYWORD_subsubsection           },
 131         { "part",                       KEYWORD_part                            },
 132         { "paragraph",          KEYWORD_paragraph                       },
 133         { "subparagraph",       KEYWORD_subparagraph            },
 134         { "include",            KEYWORD_include                         }
 135 };
 136
 137 /*
 138  *       FUNCTION DEFINITIONS
 139  */
 140
 141 static boolean isIdentChar (const int c)
 142 {
 143         return (boolean)
 144                 (isalpha (c) || isdigit (c) || c == '$' ||
 145                   c == '_' || c == '#' || c == '-' || c == '.');
 146 }
 147
 148 static void buildTexKeywordHash (void)
 149 {
 150         const size_t count = sizeof (TexKeywordTable) /
 151                 sizeof (TexKeywordTable [0]);
 152         size_t i;
 153         for (i = 0      ;  i < count  ;  ++i)
 154         {
 155                 const keywordDesc* const p = &TexKeywordTable [i];
 156                 addKeyword (p->name, Lang_js, (int) p->id);
 157         }
 158 }
 159
 160 static tokenInfo *newToken (void)
 161 {
 162         tokenInfo *const token = xMalloc (1, tokenInfo);
 163
 164         token->type                     = TOKEN_UNDEFINED;
 165         token->keyword          = KEYWORD_NONE;
 166         token->string           = vStringNew ();
 167         token->scope            = vStringNew ();
 168         token->lineNumber   = getSourceLineNumber ();
 169         token->filePosition = getInputFilePosition ();
 170
 171         return token;
 172 }
 173
 174 static void deleteToken (tokenInfo *const token)
 175 {
 176         vStringDelete (token->string);
 177         vStringDelete (token->scope);
 178         eFree (token);
 179 }
 180
 181 /*
 182  *       Tag generation functions
 183  */
 184
 185 static void makeConstTag (tokenInfo *const token, const texKind kind)
 186 {
 187         if (TexKinds [kind].enabled )
 188         {
 189                 const char *const name = vStringValue (token->string);
 190                 tagEntryInfo e;
 191                 initTagEntry (&e, name);
 192
 193                 e.lineNumber   = token->lineNumber;
 194                 e.filePosition = token->filePosition;
 195                 e.kindName         = TexKinds [kind].name;
 196                 e.kind             = TexKinds [kind].letter;
 197
 198                 makeTagEntry (&e);
 199         }
 200 }
 201
 202 static void makeTexTag (tokenInfo *const token, texKind kind)
 203 {
 204         vString *       fulltag;
 205
 206         if (TexKinds [kind].enabled)
 207         {
 208                 /*
 209                  * If a scope has been added to the token, change the token
 210                  * string to include the scope when making the tag.
 211                  */
 212                 if ( vStringLength (token->scope) > 0 )
 213                 {
 214                         fulltag = vStringNew ();
 215                         vStringCopy (fulltag, token->scope);
 216                         vStringCatS (fulltag, ".");
 217                         vStringCatS (fulltag, vStringValue (token->string));
 218                         vStringTerminate (fulltag);
 219                         vStringCopy (token->string, fulltag);
 220                         vStringDelete (fulltag);
 221                 }
 222                 makeConstTag (token, kind);
 223         }
 224 }
 225
 226 /*
 227  *       Parsing functions
 228  */
 229
 230 static void parseString (vString *const string, const int delimiter)
 231 {
 232         boolean end = FALSE;
 233         while (! end)
 234         {
 235                 int c = fileGetc ();
 236                 if (c == EOF)
 237                         end = TRUE;
 238                 else if (c == '\\')
 239                 {
 240                         c = fileGetc(); /* This maybe a ' or ". */
 241                         vStringPut (string, c);
 242                 }
 243                 else if (c == delimiter)
 244                         end = TRUE;
 245                 else
 246                         vStringPut (string, c);
 247         }
 248         vStringTerminate (string);
 249 }
 250
 251 /*
 252  *      Read a C identifier beginning with "firstChar" and places it into
 253  *      "name".
 254  */
 255 static void parseIdentifier (vString *const string, const int firstChar)
 256 {
 257         int c = firstChar;
 258         Assert (isIdentChar (c));
 259         do
 260         {
 261                 vStringPut (string, c);
 262                 c = fileGetc ();
 263         } while (isIdentChar (c));
 264
 265         vStringTerminate (string);
 266         if (!isspace (c))
 267                 fileUngetc (c);         /* unget non-identifier character */
 268 }
 269
 270 static void readToken (tokenInfo *const token)
 271 {
 272         int c;
 273
 274         token->type                     = TOKEN_UNDEFINED;
 275         token->keyword          = KEYWORD_NONE;
 276         vStringClear (token->string);
 277
 278 getNextChar:
 279         do
 280         {
 281                 c = fileGetc ();
 282                 token->lineNumber   = getSourceLineNumber ();
 283                 token->filePosition = getInputFilePosition ();
 284         }
 285         while (c == '\t'  ||  c == ' ' ||  c == '\n');
 286
 287         switch (c)
 288         {
 289                 case EOF: longjmp (Exception, (int)ExceptionEOF);       break;
 290                 case '(': token->type = TOKEN_OPEN_PAREN;                       break;
 291                 case ')': token->type = TOKEN_CLOSE_PAREN;                      break;
 292                 case ',': token->type = TOKEN_COMMA;                            break;
 293                 case '{': token->type = TOKEN_OPEN_CURLY;                       break;
 294                 case '}': token->type = TOKEN_CLOSE_CURLY;                      break;
 295                 case '[': token->type = TOKEN_OPEN_SQUARE;                      break;
 296                 case ']': token->type = TOKEN_CLOSE_SQUARE;                     break;
 297                 case '*': token->type = TOKEN_STAR;                                     break;
 298
 299                 case '\'':
 300                 case '"':
 301                                   token->type = TOKEN_STRING;
 302                                   parseString (token->string, c);
 303                                   token->lineNumber = getSourceLineNumber ();
 304                                   token->filePosition = getInputFilePosition ();
 305                                   break;
 306
 307                 case '\\':
 308                                   /*
 309                                    * All Tex tags start with a backslash.
 310                                    * Check if the next character is an alpha character
 311                                    * else it is not a potential tex tag.
 312                                    */
 313                                   c = fileGetc ();
 314                                   if (! isalpha (c))
 315                                           fileUngetc (c);
 316                                   else
 317                                   {
 318                                           parseIdentifier (token->string, c);
 319                                           token->lineNumber = getSourceLineNumber ();
 320                                           token->filePosition = getInputFilePosition ();
 321                                           token->keyword = analyzeToken (token->string, Lang_js);
 322                                           if (isKeyword (token, KEYWORD_NONE))
 323                                                   token->type = TOKEN_IDENTIFIER;
 324                                           else
 325                                                   token->type = TOKEN_KEYWORD;
 326                                   }
 327                                   break;
 328
 329                 case '%':
 330                                   fileSkipToCharacter ('\n'); /* % are single line comments */
 331                                   goto getNextChar;
 332                                   break;
 333
 334                 default:
 335                                   if (! isIdentChar (c))
 336                                           token->type = TOKEN_UNDEFINED;
 337                                   else
 338                                   {
 339                                           parseIdentifier (token->string, c);
 340                                           token->lineNumber = getSourceLineNumber ();
 341                                           token->filePosition = getInputFilePosition ();
 342                                           token->type = TOKEN_IDENTIFIER;
 343                                   }
 344                                   break;
 345         }
 346 }
 347
 348 static void copyToken (tokenInfo *const dest, tokenInfo *const src)
 349 {
 350         dest->lineNumber = src->lineNumber;
 351         dest->filePosition = src->filePosition;
 352         dest->type = src->type;
 353         dest->keyword = src->keyword;
 354         vStringCopy (dest->string, src->string);
 355         vStringCopy (dest->scope, src->scope);
 356 }
 357
 358 /*
 359  *       Scanning functions
 360  */
 361
 362 static boolean parseTag (tokenInfo *const token, texKind kind)
 363 {
 364         tokenInfo *const name = newToken ();
 365         vString *       fullname;
 366         boolean         useLongName = TRUE;
 367
 368         fullname = vStringNew ();
 369         vStringClear (fullname);
 370
 371         /*
 372          * Tex tags are of these formats:
 373          *   \keyword{any number of words}
 374          *   \keyword[short desc]{any number of words}
 375          *   \keyword*[short desc]{any number of words}
 376          *
 377          * When a keyword is found, loop through all words within
 378          * the curly braces for the tag name.
 379          */
 380
 381         if (isType (token, TOKEN_KEYWORD))
 382         {
 383                 copyToken (name, token);
 384                 readToken (token);
 385         }
 386
 387         if (isType (token, TOKEN_OPEN_SQUARE))
 388         {
 389                 useLongName = FALSE;
 390
 391                 readToken (token);
 392                 while (! isType (token, TOKEN_CLOSE_SQUARE) )
 393                 {
 394                         if (isType (token, TOKEN_IDENTIFIER))
 395                         {
 396                                 if (fullname->length > 0)
 397                                         vStringCatS (fullname, " ");
 398                                 vStringCatS (fullname, vStringValue (token->string));
 399                         }
 400                         readToken (token);
 401                 }
 402                 vStringTerminate (fullname);
 403                 vStringCopy (name->string, fullname);
 404                 makeTexTag (name, kind);
 405         }
 406
 407         if (isType (token, TOKEN_STAR))
 408         {
 409                 readToken (token);
 410         }
 411
 412         if (isType (token, TOKEN_OPEN_CURLY))
 413         {
 414                 readToken (token);
 415                 while (! isType (token, TOKEN_CLOSE_CURLY) )
 416                 {
 417                         /* if (isType (token, TOKEN_IDENTIFIER) && useLongName) */
 418                         if (useLongName)
 419                         {
 420                                 if (fullname->length > 0)
 421                                         vStringCatS (fullname, " ");
 422                                 vStringCatS (fullname, vStringValue (token->string));
 423                         }
 424                         readToken (token);
 425                 }
 426                 if (useLongName)
 427                 {
 428                         vStringTerminate (fullname);
 429                         vStringCopy (name->string, fullname);
 430                         makeTexTag (name, kind);
 431                 }
 432         }
 433
 434         deleteToken (name);
 435         vStringDelete (fullname);
 436         return TRUE;
 437 }
 438
 439 static void parseTexFile (tokenInfo *const token)
 440 {
 441         do
 442         {
 443                 readToken (token);
 444
 445                 if (isType (token, TOKEN_KEYWORD))
 446                 {
 447                         switch (token->keyword)
 448                         {
 449                                 case KEYWORD_chapter:
 450                                         parseTag (token, TEXTAG_CHAPTER);
 451                                         break;
 452                                 case KEYWORD_section:
 453                                         parseTag (token, TEXTAG_SECTION);
 454                                         break;
 455                                 case KEYWORD_subsection:
 456                                         parseTag (token, TEXTAG_SUBSUBSECTION);
 457                                         break;
 458                                 case KEYWORD_subsubsection:
 459                                         parseTag (token, TEXTAG_SUBSUBSECTION);
 460                                         break;
 461                                 case KEYWORD_part:
 462                                         parseTag (token, TEXTAG_PART);
 463                                         break;
 464                                 case KEYWORD_paragraph:
 465                                         parseTag (token, TEXTAG_PARAGRAPH);
 466                                         break;
 467                                 case KEYWORD_subparagraph:
 468                                         parseTag (token, TEXTAG_SUBPARAGRAPH);
 469                                         break;
 470                                 case KEYWORD_include:
 471                                         parseTag (token, TEXTAG_INCLUDE);
 472                                         break;
 473                                 default:
 474                                         break;
 475                         }
 476                 }
 477         } while (TRUE);
 478 }
 479
 480 static void initialize (const langType language)
 481 {
 482         Assert (sizeof (TexKinds) / sizeof (TexKinds [0]) == TEXTAG_COUNT);
 483         Lang_js = language;
 484         buildTexKeywordHash ();
 485 }
 486
 487 static void findTexTags (void)
 488 {
 489         tokenInfo *const token = newToken ();
 490         exception_t exception;
 491
 492         exception = (exception_t) (setjmp (Exception));
 493         while (exception == ExceptionNone)
 494                 parseTexFile (token);
 495
 496         deleteToken (token);
 497 }
 498
 499 /* Create parser definition stucture */
 500 extern parserDefinition* TexParser (void)
 501 {
 502         static const char *const extensions [] = { "tex", NULL };
 503         parserDefinition *const def = parserNew ("Tex");
 504         def->extensions = extensions;
 505         /*
 506          * New definitions for parsing instead of regex
 507          */
 508         def->kinds              = TexKinds;
 509         def->kindCount  = KIND_COUNT (TexKinds);
 510         def->parser             = findTexTags;
 511         def->initialize = initialize;
 512
 513         return def;
 514 }
 515 /* vi:set tabstop=4 shiftwidth=4 noexpandtab: */