d: Merge upstream dmd 4d1bfcf14, druntime 9ba9a6ae, phobos c0cc5e917.
[official-gcc.git] / gcc / d / dmd / lexer.d
blobb778bc82d77ca61bd5993dae22f4b8dbf5e09776
1 /**
2 * Implements the lexical analyzer, which converts source code into lexical tokens.
4 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
6 * Copyright: Copyright (C) 1999-2022 by The D Language Foundation, All Rights Reserved
7 * Authors: $(LINK2 https://www.digitalmars.com, Walter Bright)
8 * License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
10 * Documentation: https://dlang.org/phobos/dmd_lexer.html
11 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
14 module dmd.lexer;
16 import core.stdc.ctype;
17 import core.stdc.errno;
18 import core.stdc.stdarg;
19 import core.stdc.stdio;
20 import core.stdc.stdlib : getenv;
21 import core.stdc.string;
22 import core.stdc.time;
24 import dmd.entity;
25 import dmd.errors;
26 import dmd.globals;
27 import dmd.id;
28 import dmd.identifier;
29 import dmd.root.array;
30 import dmd.root.ctfloat;
31 import dmd.common.outbuffer;
32 import dmd.root.port;
33 import dmd.root.rmem;
34 import dmd.root.string;
35 import dmd.root.utf;
36 import dmd.tokens;
37 import dmd.utils;
39 nothrow:
41 version (DMDLIB)
43 version = LocOffset;
46 /***********************************************************
48 class Lexer
50 private __gshared OutBuffer stringbuffer;
52 Loc scanloc; // for error messages
53 Loc prevloc; // location of token before current
55 const(char)* p; // current character
57 Token token;
59 // For ImportC
60 bool Ccompile; /// true if compiling ImportC
62 // The following are valid only if (Ccompile == true)
63 ubyte boolsize; /// size of a C _Bool, default 1
64 ubyte shortsize; /// size of a C short, default 2
65 ubyte intsize; /// size of a C int, default 4
66 ubyte longsize; /// size of C long, 4 or 8
67 ubyte long_longsize; /// size of a C long long, default 8
68 ubyte long_doublesize; /// size of C long double, 8 or D real.sizeof
69 ubyte wchar_tsize; /// size of C wchar_t, 2 or 4
71 private
73 const(char)* base; // pointer to start of buffer
74 const(char)* end; // pointer to last element of buffer
75 const(char)* line; // start of current line
77 bool doDocComment; // collect doc comment information
78 bool anyToken; // seen at least one token
79 bool commentToken; // comments are TOK.comment's
80 bool tokenizeNewlines; // newlines are turned into TOK.endOfLine's
82 version (DMDLIB)
84 bool whitespaceToken; // tokenize whitespaces
87 int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
88 int lastDocLine; // last line of previous doc comment
90 Token* tokenFreelist;
93 nothrow:
/*********************
 * Constructs a Lexer over the source text base[begoffset..endoffset+1].
 * The final character, base[endoffset], has to be null (0) or EOF (0x1A).
 *
 * Params:
 *  filename = file name reported in error messages
 *  base = source text, terminated by a null (0) or EOF (0x1A) character
 *  begoffset = offset into base[] at which lexing starts
 *  endoffset = last offset of base[] that may be read
 *  doDocComment = collect documentation comments
 *  commentToken = report comments as TOK.comment tokens
 */
this(const(char)* filename, const(char)* base, size_t begoffset,
    size_t endoffset, bool doDocComment, bool commentToken) pure
{
    scanloc = Loc(filename, 1, 1);
    token = Token.init;

    this.base = base;
    this.end = base + endoffset;
    p = base + begoffset;
    line = p;

    this.doDocComment = doDocComment;
    this.commentToken = commentToken;
    this.tokenizeNewlines = false;
    this.inTokenStringConstant = 0;
    this.lastDocLine = 0;

    // A leading "#!" line (script interpreter line) is skipped entirely.
    if (p && p[0] == '#' && p[1] == '!')
    {
        p += 2;
        for (;;)
        {
            const ch = *p;
            if (ch == 0 || ch == 0x1A)
                break;      // stay on the end-of-file sentinel
            p++;
            if (ch == '\n')
                break;      // newline consumed, line is finished
        }
        endOfLine();
    }
}
version (DMDLIB)
{
    /// Same as the six-argument constructor, additionally recording whether
    /// runs of whitespace are reported as tokens (`whitespaceToken`).
    this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset,
        bool doDocComment, bool commentToken, bool whitespaceToken)
    {
        this(filename, base, begoffset, endoffset, doDocComment, commentToken);
        this.whitespaceToken = whitespaceToken;
    }

    /// Returns: `true` once the current token is TOK.endOfFile.
    bool empty() const pure @property @nogc @safe
    {
        return front() == TOK.endOfFile;
    }

    /// Returns: the kind (`value`) of the current token.
    TOK front() const pure @property @nogc @safe
    {
        return token.value;
    }

    /// Advances to the next token by calling `nextToken`.
    void popFront()
    {
        nextToken();
    }
}
/// Returns: a `Token` taken from the free list when one is available,
/// otherwise a newly allocated one. The returned token's `next` is null
/// when it comes from the free list.
Token* allocateToken() pure nothrow @safe
{
    if (!tokenFreelist)
        return new Token();

    Token* recycled = tokenFreelist;
    tokenFreelist = recycled.next;
    recycled.next = null;
    return recycled;
}
/// Puts `token` at the head of the free list so `allocateToken` can reuse it.
private void releaseToken(Token* token) pure nothrow @nogc @safe
{
    // When the GC is enabled the token is reset to its initial state first.
    if (mem.isGCEnabled)
    {
        *token = Token.init;
    }
    token.next = tokenFreelist;
    tokenFreelist = token;
}
/// Makes the following token the current one and returns its kind.
/// `prevloc` is set to the location of the token being left.
final TOK nextToken()
{
    prevloc = token.loc;

    Token* queued = token.next;
    if (queued is null)
    {
        // Nothing was peeked ahead: lex a fresh token in place.
        scan(&token);
    }
    else
    {
        // A peeked token exists: copy it over the current one and recycle it.
        memcpy(&token, queued, Token.sizeof);
        releaseToken(queued);
    }
    return token.value;
}
/***********************
 * Returns: the kind of the token that follows the current one,
 * without consuming anything.
 */
final TOK peekNext()
{
    Token* following = peek(&token);
    return following.value;
}
/***********************
 * Returns: the kind of the token two positions after the current one,
 * without consuming anything.
 */
final TOK peekNext2()
{
    return peek(peek(&token)).value;
}
/****************************
 * Lex the next token starting at `p` and store it in `t`.
 *
 * White space and (unless `commentToken` is set) comments are skipped.
 * At the end of the buffer TOK.endOfFile is produced and `p` is left on the
 * terminating 0 / 0x1A character, so repeated calls keep returning it.
 *
 * Params:
 *  t = the token to set the resulting Token to
 */
final void scan(Token* t)
{
    const lastLine = scanloc.linnum;
    Loc startLoc;
    t.blockComment = null;
    t.lineComment = null;

    while (1)
    {
        t.ptr = p;
        //printf("p = %p, *p = '%c'\n",p,*p);
        t.loc = loc();
        switch (*p)
        {
        case 0:
        case 0x1A:
            t.value = TOK.endOfFile; // end of file
            // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
            return;
        case ' ':
            // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
            while ((cast(size_t)p) % uint.sizeof)
            {
                if (*p != ' ')
                    goto LendSkipFourSpaces;
                p++;
            }
            while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20
                p += 4;
            // Skip over any remaining space on the line.
            while (*p == ' ')
                p++;
        LendSkipFourSpaces:
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\t':
        case '\v':
        case '\f':
            p++;
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\r':
            p++;
            if (*p != '\n') // if CR stands by itself
            {
                endOfLine();
                if (tokenizeNewlines)
                {
                    t.value = TOK.endOfLine;
                    tokenizeNewlines = false;
                    return;
                }
            }
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\n':
            p++;
            endOfLine();
            if (tokenizeNewlines)
            {
                t.value = TOK.endOfLine;
                tokenizeNewlines = false;
                return;
            }
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '0':
            if (!isZeroSecond(p[1])) // if numeric literal does not continue
            {
                ++p;
                t.unsvalue = 0;
                t.value = TOK.int32Literal;
                return;
            }
            goto Lnumber;

        case '1': .. case '9':
            if (!isDigitSecond(p[1])) // if numeric literal does not continue
            {
                t.unsvalue = *p - '0';
                ++p;
                t.value = TOK.int32Literal;
                return;
            }
        Lnumber:
            t.value = number(t);
            return;

        case '\'':
            if (issinglechar(p[1]) && p[2] == '\'')
            {
                t.unsvalue = p[1]; // simple one character literal
                t.value = TOK.charLiteral;
                p += 3;
            }
            else if (Ccompile)
            {
                clexerCharConstant(*t, 0);
            }
            else
            {
                t.value = charConstant(t);
            }
            return;

        case 'u':
        case 'U':
        case 'L':
            if (!Ccompile)
                goto case_ident;
            if (p[1] == '\'') // C wide character constant
            {
                char c = *p;
                // Convert L to u or U. A 2-byte wchar_t is the 16-bit prefix `u`,
                // anything else the 32-bit prefix `U`; this is the same choice the
                // wide string literal branch below makes ('w' versus 'd').
                if (c == 'L')
                    c = (wchar_tsize == 2) ? 'u' : 'U';
                ++p;
                clexerCharConstant(*t, c);
                return;
            }
            else if (p[1] == '\"') // C wide string literal
            {
                const c = *p;
                ++p;
                escapeStringConstant(t);
                t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
                            c == 'u' ? 'w' :
                            'd';
                return;
            }
            else if (p[1] == '8' && p[2] == '\"') // C UTF-8 string literal
            {
                p += 2;
                escapeStringConstant(t);
                return;
            }
            goto case_ident;

        case 'r':
            if (Ccompile || p[1] != '"')
                goto case_ident;
            p++;
            goto case '`';
        case '`':
            if (Ccompile)
                goto default;
            wysiwygStringConstant(t);
            return;
        case 'q':
            if (Ccompile)
                goto case_ident;
            if (p[1] == '"')
            {
                p++;
                delimitedStringConstant(t);
                return;
            }
            else if (p[1] == '{')
            {
                p++;
                tokenStringConstant(t);
                return;
            }
            else
                goto case_ident;
        case '"':
            escapeStringConstant(t);
            return;
        // Identifier start characters. 'q', 'r', 'u', 'L' and 'U' are handled
        // by their own cases above and reach case_ident through a goto.
        case 'a': .. case 'p':
        case 's':
        case 't':
        case 'v': .. case 'z':
        case 'A': .. case 'K':
        case 'M': .. case 'T':
        case 'V': .. case 'Z':
        case '_':
        case_ident:
            {
                while (1)
                {
                    const c = *++p;
                    if (isidchar(c))
                        continue;
                    else if (c & 0x80)
                    {
                        const s = p;
                        const u = decodeUTF();
                        if (isUniAlpha(u))
                            continue;
                        error("char 0x%04x not allowed in identifier", u);
                        p = s;
                    }
                    break;
                }
                Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr));
                t.ident = id;
                t.value = cast(TOK)id.getValue();

                anyToken = 1;

                /* Different keywords for C and D
                 */
                if (Ccompile)
                {
                    if (t.value != TOK.identifier)
                    {
                        t.value = Ckeywords[t.value]; // filter out D keywords
                    }
                }
                else if (t.value >= FirstCKeyword)
                    t.value = TOK.identifier; // filter out C keywords

                else if (*t.ptr == '_') // if special identifier token
                {
                    // Lazy initialization
                    TimeStampInfo.initialize(t.loc);

                    if (id == Id.DATE)
                    {
                        t.ustring = TimeStampInfo.date.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.TIME)
                    {
                        t.ustring = TimeStampInfo.time.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.VENDOR)
                    {
                        t.ustring = global.vendor.xarraydup.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.TIMESTAMP)
                    {
                        t.ustring = TimeStampInfo.timestamp.ptr;
                    Lstr:
                        t.value = TOK.string_;
                        t.postfix = 0;
                        t.len = cast(uint)strlen(t.ustring);
                    }
                    else if (id == Id.VERSIONX)
                    {
                        t.value = TOK.int64Literal;
                        t.unsvalue = global.versionNumber();
                    }
                    else if (id == Id.EOFX)
                    {
                        t.value = TOK.endOfFile;
                        // Advance scanner to end of file
                        while (!(*p == 0 || *p == 0x1A))
                            p++;
                    }
                }
                //printf("t.value = %d\n",t.value);
                return;
            }
        case '/':
            p++;
            switch (*p)
            {
            case '=':
                p++;
                t.value = TOK.divAssign;
                return;
            case '*':
                p++;
                startLoc = loc();
                while (1)
                {
                    while (1)
                    {
                        const c = *p;
                        switch (c)
                        {
                        case '/':
                            break;
                        case '\n':
                            endOfLine();
                            p++;
                            continue;
                        case '\r':
                            p++;
                            if (*p != '\n')
                                endOfLine();
                            continue;
                        case 0:
                        case 0x1A:
                            error("unterminated /* */ comment");
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                const u = decodeUTF();
                                if (u == PS || u == LS)
                                    endOfLine();
                            }
                            p++;
                            continue;
                        }
                        break;
                    }
                    p++;
                    if (p[-2] == '*' && p - 3 != t.ptr)
                        break;
                }
                if (commentToken)
                {
                    t.loc = startLoc;
                    t.value = TOK.comment;
                    return;
                }
                else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
                {
                    // if /** but not /**/
                    getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                    lastDocLine = scanloc.linnum;
                }
                continue;
            case '/': // do // style comments
                startLoc = loc();
                while (1)
                {
                    const c = *++p;
                    switch (c)
                    {
                    case '\n':
                        break;
                    case '\r':
                        if (p[1] == '\n')
                            p++;
                        break;
                    case 0:
                    case 0x1A:
                        if (commentToken)
                        {
                            p = end;
                            t.loc = startLoc;
                            t.value = TOK.comment;
                            return;
                        }
                        if (doDocComment && t.ptr[2] == '/')
                        {
                            getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                            lastDocLine = scanloc.linnum;
                        }
                        p = end;
                        t.loc = loc();
                        t.value = TOK.endOfFile;
                        return;
                    default:
                        if (c & 0x80)
                        {
                            const u = decodeUTF();
                            if (u == PS || u == LS)
                                break;
                        }
                        continue;
                    }
                    break;
                }
                if (commentToken)
                {
                    version (DMDLIB) {}
                    else
                    {
                        p++;
                        endOfLine();
                    }
                    t.loc = startLoc;
                    t.value = TOK.comment;
                    return;
                }
                if (doDocComment && t.ptr[2] == '/')
                {
                    getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                    lastDocLine = scanloc.linnum;
                }
                p++;
                endOfLine();
                continue;
            case '+':
                if (!Ccompile)
                {
                    int nest;
                    startLoc = loc();
                    p++;
                    nest = 1;
                    while (1)
                    {
                        char c = *p;
                        switch (c)
                        {
                        case '/':
                            p++;
                            if (*p == '+')
                            {
                                p++;
                                nest++;
                            }
                            continue;
                        case '+':
                            p++;
                            if (*p == '/')
                            {
                                p++;
                                if (--nest == 0)
                                    break;
                            }
                            continue;
                        case '\r':
                            p++;
                            if (*p != '\n')
                                endOfLine();
                            continue;
                        case '\n':
                            endOfLine();
                            p++;
                            continue;
                        case 0:
                        case 0x1A:
                            error("unterminated /+ +/ comment");
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                uint u = decodeUTF();
                                if (u == PS || u == LS)
                                    endOfLine();
                            }
                            p++;
                            continue;
                        }
                        break;
                    }
                    if (commentToken)
                    {
                        t.loc = startLoc;
                        t.value = TOK.comment;
                        return;
                    }
                    if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
                    {
                        // if /++ but not /++/
                        getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                        lastDocLine = scanloc.linnum;
                    }
                    continue;
                }
                break;
            default:
                break;
            }
            t.value = TOK.div;
            return;
        case '.':
            p++;
            if (isdigit(*p))
            {
                /* Note that we don't allow ._1 and ._ as being
                 * valid floating point numbers.
                 */
                p--;
                t.value = inreal(t);
            }
            else if (p[0] == '.')
            {
                if (p[1] == '.')
                {
                    p += 2;
                    t.value = TOK.dotDotDot;
                }
                else
                {
                    p++;
                    t.value = TOK.slice;
                }
            }
            else
                t.value = TOK.dot;
            return;
        case '&':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.andAssign;
            }
            else if (*p == '&')
            {
                p++;
                t.value = TOK.andAnd;
            }
            else
                t.value = TOK.and;
            return;
        case '|':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.orAssign;
            }
            else if (*p == '|')
            {
                p++;
                t.value = TOK.orOr;
            }
            else
                t.value = TOK.or;
            return;
        case '-':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.minAssign;
            }
            else if (*p == '-')
            {
                p++;
                t.value = TOK.minusMinus;
            }
            else if (*p == '>')
            {
                ++p;
                t.value = TOK.arrow;
            }
            else
                t.value = TOK.min;
            return;
        case '+':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.addAssign;
            }
            else if (*p == '+')
            {
                p++;
                t.value = TOK.plusPlus;
            }
            else
                t.value = TOK.add;
            return;
        case '<':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.lessOrEqual; // <=
            }
            else if (*p == '<')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.leftShiftAssign; // <<=
                }
                else
                    t.value = TOK.leftShift; // <<
            }
            else if (*p == ':' && Ccompile)
            {
                ++p;
                t.value = TOK.leftBracket; // <:
            }
            else if (*p == '%' && Ccompile)
            {
                ++p;
                t.value = TOK.leftCurly; // <%
            }
            else
                t.value = TOK.lessThan; // <
            return;
        case '>':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.greaterOrEqual; // >=
            }
            else if (*p == '>')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.rightShiftAssign; // >>=
                }
                else if (*p == '>')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.unsignedRightShiftAssign; // >>>=
                    }
                    else
                        t.value = TOK.unsignedRightShift; // >>>
                }
                else
                    t.value = TOK.rightShift; // >>
            }
            else
                t.value = TOK.greaterThan; // >
            return;
        case '!':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.notEqual; // !=
            }
            else
                t.value = TOK.not; // !
            return;
        case '=':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.equal; // ==
            }
            else if (*p == '>')
            {
                p++;
                t.value = TOK.goesTo; // =>
            }
            else
                t.value = TOK.assign; // =
            return;
        case '~':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.concatenateAssign; // ~=
            }
            else
                t.value = TOK.tilde; // ~
            return;
        case '^':
            p++;
            if (*p == '^')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.powAssign; // ^^=
                }
                else
                    t.value = TOK.pow; // ^^
            }
            else if (*p == '=')
            {
                p++;
                t.value = TOK.xorAssign; // ^=
            }
            else
                t.value = TOK.xor; // ^
            return;
        case '(':
            p++;
            t.value = TOK.leftParenthesis;
            return;
        case ')':
            p++;
            t.value = TOK.rightParenthesis;
            return;
        case '[':
            p++;
            t.value = TOK.leftBracket;
            return;
        case ']':
            p++;
            t.value = TOK.rightBracket;
            return;
        case '{':
            p++;
            t.value = TOK.leftCurly;
            return;
        case '}':
            p++;
            t.value = TOK.rightCurly;
            return;
        case '?':
            p++;
            t.value = TOK.question;
            return;
        case ',':
            p++;
            t.value = TOK.comma;
            return;
        case ';':
            p++;
            t.value = TOK.semicolon;
            return;
        case ':':
            p++;
            if (*p == ':')
            {
                ++p;
                t.value = TOK.colonColon;
            }
            else if (*p == '>' && Ccompile)
            {
                ++p;
                t.value = TOK.rightBracket;
            }
            else
                t.value = TOK.colon;
            return;
        case '$':
            p++;
            t.value = TOK.dollar;
            return;
        case '@':
            p++;
            t.value = TOK.at;
            return;
        case '*':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.mulAssign;
            }
            else
                t.value = TOK.mul;
            return;
        case '%':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.modAssign;
            }
            else if (*p == '>' && Ccompile)
            {
                ++p;
                t.value = TOK.rightCurly;
            }
            else if (*p == ':' && Ccompile)
            {
                goto case '#'; // %: means #
            }
            else
                t.value = TOK.mod;
            return;
        case '#':
            {
                // https://issues.dlang.org/show_bug.cgi?id=22825
                // Special token sequences are terminated by newlines,
                // and should not be skipped over.
                this.tokenizeNewlines = true;
                p++;
                if (parseSpecialTokenSequence())
                    continue;
                t.value = TOK.pound;
                return;
            }
        default:
            {
                dchar c = *p;
                if (c & 0x80)
                {
                    c = decodeUTF();
                    // Check for start of unicode identifier
                    if (isUniAlpha(c))
                        goto case_ident;
                    if (c == PS || c == LS)
                    {
                        endOfLine();
                        p++;
                        if (tokenizeNewlines)
                        {
                            t.value = TOK.endOfLine;
                            tokenizeNewlines = false;
                            return;
                        }
                        continue;
                    }
                }
                if (c < 0x80 && isprint(c))
                    error("character '%c' is not a valid token", c);
                else
                    error("character 0x%02x is not a valid token", c);
                p++;
                continue;
            }
        }
    }
}
/// Returns the token that follows `ct`. If it has not been lexed yet it is
/// scanned now and linked in as `ct.next`, so a later call returns the same one.
final Token* peek(Token* ct)
{
    if (ct.next is null)
    {
        Token* fresh = allocateToken();
        scan(fresh);
        ct.next = fresh;
    }
    return ct.next;
}
/*********************************
 * tk is on the opening (.
 * Look ahead and return the token that follows the matching closing ).
 * The search also stops, returning the stopping token itself, at an
 * unbalanced closing curly brace, at a semicolon outside any curly
 * braces, or at end of file.
 */
final Token* peekPastParen(Token* tk)
{
    int parenDepth = 1;
    int curlyDepth = 0;
    for (;;)
    {
        tk = peek(tk);
        switch (tk.value)
        {
        case TOK.leftParenthesis:
            ++parenDepth;
            continue;
        case TOK.rightParenthesis:
            if (--parenDepth != 0)
                continue;
            tk = peek(tk);      // step past the matching )
            break;
        case TOK.leftCurly:
            ++curlyDepth;
            continue;
        case TOK.rightCurly:
            if (--curlyDepth >= 0)
                continue;
            break;
        case TOK.semicolon:
            if (curlyDepth != 0)
                continue;
            break;
        case TOK.endOfFile:
            break;
        default:
            continue;
        }
        return tk;
    }
}
/*******************************************
 * Parse the escape sequence at the current position `p`.
 * Forwards to the static overload with the current token's location and
 * the `Ccompile` flag; `p` is passed by reference and moved past the
 * sequence by that overload.
 */
private uint escapeSequence()
{
    const dchar decoded = Lexer.escapeSequence(token.loc, p, Ccompile);
    return decoded;
}
/********
 * Parse the given string literal escape sequence into a single character.
 * D https://dlang.org/spec/lex.html#escape_sequences
 * C11 6.4.4.4
 * Params:
 *  loc = location to use for error messages
 *  sequence = pointer to string with escape sequence to parse. Updated to
 *             point past the end of the escape sequence. It is never moved
 *             past a terminating 0 or 0x1A character.
 *  Ccompile = true for compile C11 escape sequences
 * Returns:
 *  the escape sequence as a single character
 */
private static dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile)
{
    const(char)* p = sequence; // cache sequence reference on stack
    scope(exit) sequence = p;

    uint c = *p;
    int ndigits;
    switch (c)
    {
    case '\'':
    case '"':
    case '?':
    case '\\':
    Lconsume:
        p++;
        break;
    case 'a':
        c = 7;
        goto Lconsume;
    case 'b':
        c = 8;
        goto Lconsume;
    case 'f':
        c = 12;
        goto Lconsume;
    case 'n':
        c = 10;
        goto Lconsume;
    case 'r':
        c = 13;
        goto Lconsume;
    case 't':
        c = 9;
        goto Lconsume;
    case 'v':
        c = 11;
        goto Lconsume;
    case 'u':
        ndigits = 4;
        goto Lhex;
    case 'U':
        ndigits = 8;
        goto Lhex;
    case 'x':
        ndigits = 2;
    Lhex:
        p++;
        c = *p;
        if (ishex(cast(char)c))
        {
            uint v = 0;
            int n = 0;
            while (1)
            {
                if (isdigit(cast(char)c))
                    c -= '0';
                else if (islower(c))
                    c -= 'a' - 10;
                else
                    c -= 'A' - 10;
                v = v * 16 + c;
                c = *++p;
                if (++n == ndigits)
                    break;
                if (!ishex(cast(char)c))
                {
                    .error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
                    break;
                }
            }
            if (ndigits != 2 && !utf_isValidDchar(v))
            {
                .error(loc, "invalid UTF character \\U%08x", v);
                v = '?'; // recover with valid UTF character
            }
            c = v;
        }
        else
        {
            // `sequence` is only updated on scope exit, so sequence[0] is
            // still the x/u/U that introduced the sequence.
            .error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
            // Skip the offending character, but never step over the
            // end-of-file sentinel: the caller has to find it at `p`.
            if (c != 0 && c != 0x1A)
                p++;
        }
        break;
    case '&':
        if (Ccompile)
            goto default;

        // named character entity
        for (const idstart = ++p; 1; p++)
        {
            switch (*p)
            {
            case ';':
                c = HtmlNamedEntity(idstart, p - idstart);
                if (c == ~0)
                {
                    .error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
                    c = '?';
                }
                p++;
                break;
            default:
                if (isalpha(*p) || (p != idstart && isdigit(*p)))
                    continue;
                .error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
                c = '?';
                break;
            }
            break;
        }
        break;
    case 0:
    case 0x1A:
        // end of file: yield the backslash itself and leave `p` on the sentinel
        c = '\\';
        break;
    default:
        if (isoctal(cast(char)c))
        {
            // up to three octal digits
            uint v = 0;
            int n = 0;
            do
            {
                v = v * 8 + (c - '0');
                c = *++p;
            }
            while (++n < 3 && isoctal(cast(char)c));
            c = v;
            if (c > 0xFF)
                .error(loc, "escape octal sequence \\%03o is larger than \\377", c);
        }
        else
        {
            .error(loc, "undefined escape sequence \\%c", c);
            p++;
        }
        break;
    }
    return c;
}
/**
Lex a wysiwyg string. On entry `p` points at the character just before the
string contents; that character is also the one that ends the literal
(i.e. backtick or double-quote). No escape sequences are processed.
Params:
    result = pointer to the token that accepts the result
*/
private void wysiwygStringConstant(Token* result)
{
    result.value = TOK.string_;
    Loc start = loc();
    auto terminator = *p++;
    stringbuffer.setsize(0);
    for (;;)
    {
        dchar ch = *p++;
        switch (ch)
        {
        case '\n':
            endOfLine();
            break;
        case '\r':
            if (*p == '\n')
                continue;   // CR of a CRLF pair: the LF is handled next
            ch = '\n';      // a lone CR is stored as \n
            endOfLine();
            break;
        case 0:
        case 0x1A:
            error("unterminated string constant starting at %s", start.toChars());
            result.setString();
            // step back so `p` points to the EOF character
            p--;
            return;
        default:
            if (ch == terminator)
            {
                result.setString(stringbuffer);
                stringPostfix(result);
                return;
            }
            if (ch & 0x80)
            {
                // multi-byte UTF-8 sequence: decode from its first byte
                p--;
                const decoded = decodeUTF();
                p++;
                if (decoded == PS || decoded == LS)
                    endOfLine();
                stringbuffer.writeUTF8(decoded);
                continue;
            }
            break;
        }
        stringbuffer.writeByte(ch);
    }
}
/**
Lex a delimited string. Some examples of delimited strings are:
---
q"(foo(xxx))"      // "foo(xxx)"
q"[foo$(LPAREN)]"  // "foo$(LPAREN)"
q"/foo]/"          // "foo]"
q"HERE
foo
HERE"              // "foo\n"
---
On entry `p` points to the opening double-quote '"'.
Params:
    result = pointer to the token that accepts the result
*/
private void delimitedStringConstant(Token* result)
{
    result.value = TOK.string_;
    Loc start = loc();
    dchar delimleft = 0;        // opening delimiter, 0 until it has been read
    dchar delimright = 0;       // matching closing delimiter
    uint nest = 1;              // 1 when the delimiters nest: ( [ { <
    uint nestcount = ~0;        // dead assignment, needed to suppress warning
    Identifier hereid = null;   // heredoc identifier, if any
    uint blankrol = 0;          // 1 while the rest of the heredoc opening line must be blank
    uint startline = 0;         // 1 when positioned at the start of a line
    p++;
    stringbuffer.setsize(0);
    while (1)
    {
        dchar c = *p++;
        switch (c)
        {
        case '\n':
        Lnextline:
            endOfLine();
            startline = 1;
            if (blankrol)
            {
                blankrol = 0;
                continue;
            }
            if (hereid)
            {
                stringbuffer.writeUTF8(c);
                continue;
            }
            break;
        case '\r':
            if (*p == '\n')
                continue; // CR of a CRLF pair: the LF is handled next
            c = '\n'; // a lone CR counts as \n
            goto Lnextline;
        case 0:
        case 0x1A:
            error("unterminated delimited string constant starting at %s", start.toChars());
            result.setString();
            // step back: `p` has to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
            return;
        default:
            if (c & 0x80)
            {
                p--;
                c = decodeUTF();
                p++;
                if (c == PS || c == LS)
                    goto Lnextline;
            }
            break;
        }
        if (delimleft == 0)
        {
            // First character after the quote: it selects the delimiter.
            delimleft = c;
            nest = 1;
            nestcount = 1;
            if (c == '(')
                delimright = ')';
            else if (c == '{')
                delimright = '}';
            else if (c == '[')
                delimright = ']';
            else if (c == '<')
                delimright = '>';
            else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
            {
                // Start of identifier; must be a heredoc
                Token tok;
                p--;
                scan(&tok); // read in heredoc identifier
                if (tok.value != TOK.identifier)
                {
                    error("identifier expected for heredoc, not %s", tok.toChars());
                    delimright = c;
                }
                else
                {
                    hereid = tok.ident;
                    blankrol = 1;
                }
                nest = 0;
            }
            else
            {
                delimright = c;
                nest = 0;
                if (isspace(c))
                    error("delimiter cannot be whitespace");
            }
        }
        else
        {
            if (blankrol)
            {
                error("heredoc rest of line should be blank");
                blankrol = 0;
                continue;
            }
            if (nest == 1)
            {
                if (c == delimleft)
                    nestcount++;
                else if (c == delimright)
                {
                    nestcount--;
                    if (nestcount == 0)
                        goto Ldone;
                }
            }
            else if (c == delimright)
                goto Ldone;
            if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
            {
                // An identifier at the start of a line may be the closing heredoc identifier.
                Token tok;
                auto psave = p;
                p--;
                scan(&tok); // read in possible heredoc identifier
                if (tok.value == TOK.identifier && tok.ident is hereid)
                {
                    /* should check that rest of line is blank
                     */
                    goto Ldone;
                }
                p = psave;
            }
            stringbuffer.writeUTF8(c);
            startline = 0;
        }
    }
Ldone:
    if (*p == '"')
        p++;
    else if (hereid)
        error("delimited string must end in `%s\"`", hereid.toChars());
    else if (isspace(delimright))
        error("delimited string must end in `\"`");
    else
        error("delimited string must end in `%c\"`", delimright);
    result.setString(stringbuffer);
    stringPostfix(result);
}
/**
Lex a token string. Some examples of token strings are:
---
q{ foo(xxx) }    // " foo(xxx) "
q{foo$(LPAREN)}  // "foo$(LPAREN)"
q{{foo}"}"}      // "{foo}"}""
---
On entry `p` points to the opening curly-brace. The contents are lexed as
tokens, and the string value is the source text between the braces.
Params:
    result = pointer to the token that accepts the result
*/
private void tokenStringConstant(Token* result)
{
    result.value = TOK.string_;

    uint depth = 1;
    const start = loc();
    const pstart = ++p;
    inTokenStringConstant++;
    scope(exit) inTokenStringConstant--;
    for (;;)
    {
        Token tok;
        scan(&tok);
        if (tok.value == TOK.leftCurly)
        {
            depth++;
        }
        else if (tok.value == TOK.rightCurly)
        {
            if (--depth == 0)
            {
                // `p` is just past the closing brace; exclude the brace itself.
                result.setString(pstart, p - 1 - pstart);
                stringPostfix(result);
                return;
            }
        }
        else if (tok.value == TOK.endOfFile)
        {
            error("unterminated token string constant starting at %s", start.toChars());
            result.setString();
            return;
        }
    }
}
/**
Scan a quoted string, building its processed value by translating escape
sequences. The result is stored in the given `t` token.
On entry `p` points to the opening quote of the string; that same
character ends the literal.
Params:
    t = the token to set the resulting string to
* References:
*   D https://dlang.org/spec/lex.html#double_quoted_strings
*   ImportC C11 6.4.5
*/
private void escapeStringConstant(Token* t)
{
    t.value = TOK.string_;

    const start = loc();
    const tc = *p++; // opening quote
    stringbuffer.setsize(0);
    while (1)
    {
        dchar c = *p++;
        switch (c)
        {
        case '\\':
            switch (*p)
            {
            case '&':
                if (Ccompile)
                    goto default;
                goto case;

            case 'u':
            case 'U':
                // these escapes are written out UTF-8 encoded
                c = escapeSequence();
                stringbuffer.writeUTF8(c);
                continue;
            default:
                // every other escape is stored as a single byte below
                c = escapeSequence();
                break;
            }
            break;
        case '\n':
            endOfLine();
            if (Ccompile)
                goto Lunterminated; // a line end terminates the literal when compiling C
            break;
        case '\r':
            if (*p == '\n')
                continue; // CR of a CRLF pair: the LF is handled next
            c = '\n'; // a lone CR counts as \n
            endOfLine();
            if (Ccompile)
                goto Lunterminated;
            break;
        case '\'':
        case '"':
            if (c != tc)
                goto default; // the other quote kind is ordinary content
            t.setString(stringbuffer);
            if (!Ccompile)
                stringPostfix(t);
            return;
        case 0:
        case 0x1A:
            // step back: `p` has to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
        Lunterminated:
            error("unterminated string constant starting at %s", start.toChars());
            t.setString();
            return;
        default:
            if (c & 0x80)
            {
                p--;
                c = decodeUTF();
                if (c == LS || c == PS)
                {
                    c = '\n';
                    endOfLine();
                    if (Ccompile)
                        goto Lunterminated;
                }
                p++;
                stringbuffer.writeUTF8(c);
                continue;
            }
            break;
        }
        stringbuffer.writeByte(c);
    }
}
/**************************************
 * Lex a D character literal. On entry `p` points to the opening single
 * quote. The character value is stored in `t.unsvalue`; after an error it
 * is set to '?'.
 * Params:
 *  t = the token that receives the character value
 * Returns:
 *  TOK.charLiteral, TOK.wcharLiteral or TOK.dcharLiteral
 * Reference:
 *  https://dlang.org/spec/lex.html#characterliteral
 */
private TOK charConstant(Token* t)
{
    TOK tk = TOK.charLiteral;
    //printf("Lexer::charConstant\n");
    p++;
    dchar c = *p++;
    switch (c)
    {
    case '\\':
        switch (*p)
        {
        case 'u':
            t.unsvalue = escapeSequence();
            tk = TOK.wcharLiteral;
            break;
        case 'U':
        case '&':
            t.unsvalue = escapeSequence();
            tk = TOK.dcharLiteral;
            break;
        default:
            t.unsvalue = escapeSequence();
            break;
        }
        break;
    case '\n':
    L1: // also the target for Unicode line/paragraph separators, see default below
        endOfLine();
        goto case;
    case '\r':
        goto case '\'';
    case 0:
    case 0x1A:
        // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
        p--;
        goto case;
    case '\'':
        error("unterminated character constant");
        t.unsvalue = '?';
        return tk;
    default:
        if (c & 0x80)
        {
            p--;
            c = decodeUTF();
            p++;
            if (c == LS || c == PS)
                goto L1;
            if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
                tk = TOK.wcharLiteral;
            else
                tk = TOK.dcharLiteral;
        }
        t.unsvalue = c;
        break;
    }
    if (*p != '\'')
    {
        // Error recovery: skip ahead to the closing quote, or to a character
        // at which the literal is given up on.
        while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
                *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
        {
            if (*p & 0x80)
            {
                const s = p;
                c = decodeUTF();
                if (c == LS || c == PS)
                {
                    p = s;
                    break;
                }
            }
            p++;
        }

        if (*p == '\'')
        {
            error("character constant has multiple characters");
            p++;
        }
        else
            error("unterminated character constant");
        t.unsvalue = '?';
        return tk;
    }
    p++;
    return tk;
}
/***************************************
 * Lex C character constant.
 * Parser is on the opening quote.
 *
 * The body of the literal is lexed by `escapeStringConstant`; its bytes are
 * then folded into one integer according to `prefix`. On return `t.value`
 * is `TOK.charLiteral` when the body is a single byte, `TOK.int32Literal`
 * otherwise (`TOK.semicolon` for an empty literal), and `t.unsvalue` holds
 * the folded value.
 * Params:
 *      t = token to fill in
 *      prefix = one of `u`, `U` or 0.
 * Reference:
 *      C11 6.4.4.4
 */
private void clexerCharConstant(ref Token t, char prefix)
{
    escapeStringConstant(&t);
    const(char)[] str = t.ustring[0 .. t.len];
    const n = str.length;
    const loc = t.loc;
    if (n == 0)
    {
        error(loc, "empty character constant");
        t.value = TOK.semicolon;
        return;
    }

    uint u;
    switch (prefix)
    {
        case 0:
            if (n == 1) // fast case
            {
                u = str[0];
            }
            else if (n > 4)
                error(loc, "max number of chars in character literal is 4, had %d",
                    cast(int)n);
            else
            {
                // byte n-1 of `u` receives the first char, byte 0 the last one
                foreach (i, c; str)
                    (cast(char*)&u)[n - 1 - i] = c;
            }
            break;

        case 'u':
            dchar d1;
            size_t idx;
            auto msg = utf_decodeChar(str, idx, d1);
            dchar d2 = 0;
            if (idx < n && !msg)
                msg = utf_decodeChar(str, idx, d2);
            if (msg)
                // `msg` is a D slice, so it must be printed with an explicit length
                error(loc, "%.*s", cast(int)msg.length, msg.ptr);
            else if (idx < n)
                // the count is a size_t; narrow it to match `%d`
                error(loc, "max number of chars in 16 bit character literal is 2, had %d",
                    cast(int)((n + 1) >> 1));
            else if (d1 >= 0x1_0000) // 0x10000 itself already needs 17 bits
                error(loc, "%d does not fit in 16 bits", d1);
            else if (d2 >= 0x1_0000)
                error(loc, "%d does not fit in 16 bits", d2);
            u = d1;
            if (d2)
                u = (d1 << 16) | d2;
            break;

        case 'U':
            dchar d;
            size_t idx;
            auto msg = utf_decodeChar(str, idx, d);
            if (msg)
                error(loc, "%.*s", cast(int)msg.length, msg.ptr);
            else if (idx < n)
                error(loc, "max number of chars in 32 bit character literal is 1, had %d",
                    cast(int)((n + 3) >> 2));
            u = d;
            break;

        default:
            assert(0);
    }
    t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal;
    t.unsvalue = u;
}
/***************************************
 * Consume the optional postfix character of a string literal.
 *
 * If `p` is on `c`, `w` or `d`, that character is recorded in `t.postfix`
 * and `p` advances past it; otherwise `t.postfix` is set to 0 and `p`
 * stays where it is.
 */
private void stringPostfix(Token* t) pure @nogc
{
    const ch = *p;
    if (ch == 'c' || ch == 'w' || ch == 'd')
    {
        t.postfix = ch;
        ++p;
    }
    else
    {
        t.postfix = 0;
    }
}
1881 /**************************************
1882 * Read in a number.
1883 * If it's an integer, store it in t.unsvalue.
1884 * Integers can be binary, octal, decimal or hex.
1885 * Handle the suffixes U, UL, LU, L, etc.
1886 * If it's floating point, inreal() lexes it and stores it in t.floatvalue.
1887 * Returns:
1888 * the TOK of the integer literal, or
1889 * the TOK returned by inreal() for a floating point literal
// Lex a numeric literal starting at `p`.
// Integer digits are accumulated into `n`, which is stored in `t.unsvalue`. As soon as
// the text looks like floating point, control jumps to `Lreal`, which rewinds `p` to
// `start` and lets `inreal` lex the literal again from its first character.
// When `Ccompile` is set, suffix parsing and the literal kind are left to `cnumber`;
// otherwise the D suffixes (`u`, `U`, `L`) are parsed here.
// Returns the TOK kind of the literal.
1891 private TOK number(Token* t)
1893 int base = 10;
1894 const start = p;
1895 uinteger_t n = 0; // unsigned >=64 bit integer type
1896 int d;
1897 bool err = false;
1898 bool overflow = false;
1899 bool anyBinaryDigitsNoSingleUS = false;
1900 bool anyHexDigitsNoSingleUS = false;
1901 char errorDigit = 0;
1902 dchar c = *p;
// a leading 0 may introduce a base prefix; the character after it decides
1903 if (c == '0')
1905 ++p;
1906 c = *p;
1907 switch (c)
1909 case '0':
1910 case '1':
1911 case '2':
1912 case '3':
1913 case '4':
1914 case '5':
1915 case '6':
1916 case '7':
1917 base = 8;
1918 break;
1920 case '8':
1921 case '9':
// not an octal digit: remember it so that it is reported at Ldone
1922 errorDigit = cast(char) c;
1923 base = 8;
1924 break;
1925 case 'x':
1926 case 'X':
1927 ++p;
1928 base = 16;
1929 break;
1930 case 'b':
1931 case 'B':
1932 if (Ccompile)
1933 error("binary constants not allowed");
1934 ++p;
1935 base = 2;
1936 break;
1937 case '.':
1938 if (p[1] == '.')
1939 goto Ldone; // if ".."
1940 if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
1942 if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
1943 goto Lreal; // if `0.f` or `0.L`
1944 goto Ldone; // if ".identifier" or ".unicode"
1946 goto Lreal; // '.' is part of current token
1947 case 'i':
1948 case 'f':
1949 case 'F':
1950 goto Lreal;
1951 case '_':
1952 if (Ccompile)
1953 error("embedded `_` not allowed");
// a 0 followed by an underscore is lexed as octal
1954 ++p;
1955 base = 8;
1956 break;
1957 case 'L':
1958 if (p[1] == 'i')
1959 goto Lreal;
1960 break;
1961 default:
1962 break;
// main digit loop: every pass consumes one digit (or, outside ImportC, one underscore)
// and leaves through Ldone or Lreal at the first character that cannot continue an integer
1965 while (1)
1967 c = *p;
1968 switch (c)
1970 case '0':
1971 case '1':
1972 case '2':
1973 case '3':
1974 case '4':
1975 case '5':
1976 case '6':
1977 case '7':
1978 case '8':
1979 case '9':
1980 ++p;
1981 d = c - '0';
1982 break;
1983 case 'a':
1984 case 'b':
1985 case 'c':
1986 case 'd':
1987 case 'e':
1988 case 'f':
1989 case 'A':
1990 case 'B':
1991 case 'C':
1992 case 'D':
1993 case 'E':
1994 case 'F':
1995 ++p;
// outside hex, e/E/f/F mean the literal is really floating point
1996 if (base != 16)
1998 if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
1999 goto Lreal;
2001 if (c >= 'a')
2002 d = c + 10 - 'a';
2003 else
2004 d = c + 10 - 'A';
2005 break;
2006 case 'L':
2007 if (p[1] == 'i')
2008 goto Lreal;
2009 goto Ldone;
2010 case '.':
2011 if (p[1] == '.')
2012 goto Ldone; // if ".."
2013 if (base <= 10 && n > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
2015 if (Ccompile && base == 10 &&
2016 (p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
2017 goto Lreal; // if `1.e6` or `1.f` or `1.L`
2018 goto Ldone; // if ".identifier" or ".unicode"
2020 if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
2021 goto Ldone; // if ".identifier" or ".unicode"
2022 if (base == 2)
2023 goto Ldone; // if ".identifier" or ".unicode"
2024 goto Lreal; // otherwise as part of a floating point literal
2025 case 'p':
2026 case 'P':
2027 case 'i':
// restart from the first character of the literal and lex it as floating point
2028 Lreal:
2029 p = start;
2030 return inreal(t);
2031 case '_':
2032 if (Ccompile)
2033 goto default;
2034 ++p;
2035 continue;
2036 default:
2037 goto Ldone;
2039 // got a digit here, set any necessary flags, check for errors
2040 anyHexDigitsNoSingleUS = true;
2041 anyBinaryDigitsNoSingleUS = true;
// only the first digit that is out of range for the base is remembered
2042 if (!errorDigit && d >= base)
2044 errorDigit = cast(char) c;
2046 // Avoid expensive overflow check if we aren't at risk of overflow
2047 if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
2048 n = n * base + d;
2049 else
2051 import core.checkedint : mulu, addu;
2053 n = mulu(n, base, overflow);
2054 n = addu(n, d, overflow);
// all digits consumed: report the problems collected while scanning
2057 Ldone:
2058 if (errorDigit)
2060 error("%s digit expected, not `%c`", base == 2 ? "binary".ptr :
2061 base == 8 ? "octal".ptr :
2062 "decimal".ptr, errorDigit);
2063 err = true;
2065 if (overflow && !err)
2067 error("integer overflow");
2068 err = true;
// a binary or hex prefix that was followed by no digit at all
2070 if ((base == 2 && !anyBinaryDigitsNoSingleUS) ||
2071 (base == 16 && !anyHexDigitsNoSingleUS))
2072 error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);
2074 t.unsvalue = n;
// ImportC: suffixes and literal kind follow the C rules implemented in cnumber
2076 if (Ccompile)
2077 return cnumber(base, n);
2079 enum FLAGS : int
2081 none = 0,
2082 decimal = 1, // decimal
2083 unsigned = 2, // u or U suffix
2084 long_ = 4, // L suffix
2087 FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
2088 // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2089 const psuffix = p;
// NOTE(review): the `L1` label targeted by the `goto L1` statements below is not visible in
// this extract; presumably it precedes the `p++` that follows `case 'L'` - confirm against upstream
2090 while (1)
2092 FLAGS f;
2093 switch (*p)
2095 case 'U':
2096 case 'u':
2097 f = FLAGS.unsigned;
2098 goto L1;
2099 case 'l':
2100 f = FLAGS.long_;
2101 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
2102 goto L1;
2103 case 'L':
2104 f = FLAGS.long_;
2106 p++;
// the same suffix kind given twice is an error, reported only if nothing else was reported
2107 if ((flags & f) && !err)
2109 error("unrecognized token");
2110 err = true;
2112 flags = cast(FLAGS)(flags | f);
2113 continue;
2114 default:
2115 break;
2117 break;
// any base 8 literal whose value is 8 or more is rejected
2119 if (base == 8 && n >= 8)
2121 if (err)
2122 // can't translate invalid octal value, just show a generic message
2123 error("octal literals larger than 7 are no longer supported");
2124 else
2125 error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
2126 n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
// choose the literal kind from the suffix flags and the magnitude of n
2128 TOK result;
2129 switch (flags)
2131 case FLAGS.none:
2132 /* Octal or Hexadecimal constant.
2133 * First that fits: int, uint, long, ulong
2135 if (n & 0x8000000000000000L)
2136 result = TOK.uns64Literal;
2137 else if (n & 0xFFFFFFFF00000000L)
2138 result = TOK.int64Literal;
2139 else if (n & 0x80000000)
2140 result = TOK.uns32Literal;
2141 else
2142 result = TOK.int32Literal;
2143 break;
2144 case FLAGS.decimal:
2145 /* First that fits: int, long, long long
2147 if (n & 0x8000000000000000L)
2149 result = TOK.uns64Literal;
2151 else if (n & 0xFFFFFFFF80000000L)
2152 result = TOK.int64Literal;
2153 else
2154 result = TOK.int32Literal;
2155 break;
2156 case FLAGS.unsigned:
2157 case FLAGS.decimal | FLAGS.unsigned:
2158 /* First that fits: uint, ulong
2160 if (n & 0xFFFFFFFF00000000L)
2161 result = TOK.uns64Literal;
2162 else
2163 result = TOK.uns32Literal;
2164 break;
2165 case FLAGS.decimal | FLAGS.long_:
2166 if (n & 0x8000000000000000L)
2168 if (!err)
2170 error("signed integer overflow");
2171 err = true;
2173 result = TOK.uns64Literal;
2175 else
2176 result = TOK.int64Literal;
2177 break;
2178 case FLAGS.long_:
2179 if (n & 0x8000000000000000L)
2180 result = TOK.uns64Literal;
2181 else
2182 result = TOK.int64Literal;
2183 break;
2184 case FLAGS.unsigned | FLAGS.long_:
2185 case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
2186 result = TOK.uns64Literal;
2187 break;
2188 default:
2189 debug
2191 printf("%x\n", flags);
2193 assert(0);
2195 return result;
2198 /**************************************
2199 * Lex C integer-suffix
2200 * Params:
2201 * base = number base
2202 * n = raw integer value
2203 * Returns:
2204 * token value
// Parse the integer suffix that follows a C integer literal and choose its token kind.
// `p` is on the first character after the digits; every suffix character is consumed.
// The kind depends on the base, the suffixes seen, the magnitude of `n` and the
// `longsize` / `long_longsize` fields of the lexer.
2206 private TOK cnumber(int base, uinteger_t n)
2208 /* C11 6.4.4.1
2209 * Parse trailing suffixes:
2210 * u or U
2211 * l or L
2212 * ll or LL
2214 enum FLAGS : uint
2216 octalhex = 1, // octal or hexadecimal
2217 decimal = 2, // decimal
2218 unsigned = 4, // u or U suffix
2219 long_ = 8, // l or L suffix
2220 llong = 0x10 // ll or LL
2222 FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex;
2223 bool err;
2224 Lsuffixes:
2225 while (1)
2227 FLAGS f;
2228 const cs = *p;
2229 switch (cs)
2231 case 'U':
2232 case 'u':
2233 f = FLAGS.unsigned;
2234 break;
2236 case 'l':
2237 case 'L':
2238 f = FLAGS.long_;
// the same letter twice in the same case (ll or LL) is the long long suffix
2239 if (cs == p[1])
2241 f = FLAGS.long_ | FLAGS.llong;
2242 ++p;
2244 break;
2246 default:
2247 break Lsuffixes;
2249 ++p;
// a suffix kind that was already seen is reported once
2250 if ((flags & f) && !err)
2252 error("duplicate integer suffixes");
2253 err = true;
2255 flags = cast(FLAGS)(flags | f);
2258 TOK result = TOK.int32Literal; // default
2259 switch (flags)
2261 /* Since D doesn't have a variable sized `long` or `unsigned long` type,
2262 * this code deviates from C by picking D int, uint, long, or ulong instead
2265 case FLAGS.octalhex:
2266 /* Octal or Hexadecimal constant.
2267 * First that fits: int, unsigned, long, unsigned long,
2268 * long long, unsigned long long
2270 if (n & 0x8000000000000000L)
2271 result = TOK.uns64Literal; // unsigned long
2272 else if (n & 0xFFFFFFFF00000000L)
2273 result = TOK.int64Literal; // long
2274 else if (n & 0x80000000)
2275 result = TOK.uns32Literal;
2276 else
2277 result = TOK.int32Literal;
2278 break;
2280 case FLAGS.decimal:
2281 /* First that fits: int, long, long long
2283 if (n & 0x8000000000000000L)
2284 result = TOK.uns64Literal; // unsigned long
2285 else if (n & 0xFFFFFFFF80000000L)
2286 result = TOK.int64Literal; // long
2287 else
2288 result = TOK.int32Literal;
2289 break;
2291 case FLAGS.octalhex | FLAGS.unsigned:
2292 case FLAGS.decimal | FLAGS.unsigned:
2293 /* First that fits: unsigned, unsigned long, unsigned long long
2295 if (n & 0xFFFFFFFF00000000L)
2296 result = TOK.uns64Literal; // unsigned long
2297 else
2298 result = TOK.uns32Literal;
2299 break;
2301 case FLAGS.decimal | FLAGS.long_:
2302 /* First that fits: long, long long
2304 if (longsize == 4 || long_longsize == 4)
2306 if (n & 0xFFFFFFFF_80000000L)
2307 result = TOK.int64Literal;
2308 else
2309 result = TOK.int32Literal; // long
2311 else
2313 result = TOK.int64Literal; // long
2315 break;
2317 case FLAGS.octalhex | FLAGS.long_:
2318 /* First that fits: long, unsigned long, long long,
2319 * unsigned long long
2321 if (longsize == 4 || long_longsize == 4)
2323 if (n & 0x8000000000000000L)
2324 result = TOK.uns64Literal;
2325 else if (n & 0xFFFFFFFF00000000L)
2326 result = TOK.int64Literal;
2327 else if (n & 0x80000000)
2328 result = TOK.uns32Literal; // unsigned long
2329 else
2330 result = TOK.int32Literal; // long
2332 else
2334 if (n & 0x80000000_00000000L)
2335 result = TOK.uns64Literal; // unsigned long
2336 else
2337 result = TOK.int64Literal; // long
2339 break;
2341 case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_:
2342 case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
2343 /* First that fits: unsigned long, unsigned long long
2345 if (longsize == 4 || long_longsize == 4)
2347 if (n & 0xFFFFFFFF00000000L)
2348 result = TOK.uns64Literal;
2349 else
2350 result = TOK.uns32Literal; // unsigned long
2352 else
2354 result = TOK.uns64Literal; // unsigned long
2356 break;
2358 case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong:
2359 /* First that fits: long long, unsigned long long
2361 if (n & 0x8000000000000000L)
2362 result = TOK.uns64Literal;
2363 else
2364 result = TOK.int64Literal;
2365 break;
2367 case FLAGS.decimal | FLAGS.long_ | FLAGS.llong:
2368 /* long long
2370 result = TOK.int64Literal;
2371 break;
2373 case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
2374 case FLAGS.decimal | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
2375 result = TOK.uns64Literal;
2376 break;
2378 default:
2379 debug printf("%x\n",flags);
2380 assert(0);
2382 return result;
2385 /**************************************
2386 * Read in characters, converting them to real.
2387 * Bugs:
2388 * Exponent overflow not detected.
2389 * Too much requested precision is not detected.
// Lex a floating point literal starting at `p` (a digit or a `.`).
// The mantissa and optional exponent are scanned first; the scanned text, minus any
// underscores, is copied into `stringbuffer` with a terminating 0 and parsed with
// `CTFloat.parse` into `t.floatvalue` (zero if the literal was malformed).
// The suffixes `f`/`F`, `L`/`l` and, outside ImportC, the imaginary `i` then select the
// returned TOK.
2391 private TOK inreal(Token* t)
2393 //printf("Lexer::inreal()\n");
2394 debug
2396 assert(*p == '.' || isdigit(*p));
2398 bool isWellformedString = true;
2399 stringbuffer.setsize(0);
2400 auto pstart = p;
2401 bool hex = false;
// `c` always holds the character just read; `p` is one past it
2402 dchar c = *p++;
2403 // Leading '0x'
2404 if (c == '0')
2406 c = *p++;
2407 if (c == 'x' || c == 'X')
2409 hex = true;
2410 c = *p++;
2413 // Digits to left of '.'
2414 while (1)
2416 if (c == '.')
2418 c = *p++;
2419 break;
2421 if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2423 c = *p++;
2424 continue;
2426 break;
2428 // Digits to right of '.'
2429 while (1)
2431 if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2433 c = *p++;
2434 continue;
2436 break;
// exponent: `e`/`E`, or `p`/`P` for a hex float, then an optional sign and the digits
2438 if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
2440 c = *p++;
2441 if (c == '-' || c == '+')
2443 c = *p++;
2445 bool anyexp = false;
2446 while (1)
2448 if (isdigit(c))
2450 anyexp = true;
2451 c = *p++;
2452 continue;
2454 if (c == '_')
2456 if (Ccompile)
2457 error("embedded `_` in numeric literals not allowed");
2458 c = *p++;
2459 continue;
2461 if (!anyexp)
2463 error("missing exponent");
2464 isWellformedString = false;
2466 break;
2469 else if (hex)
2471 error("exponent required for hex float");
2472 isWellformedString = false;
// the scan above read one character past the literal; step back onto it
2474 --p;
// copy the literal text without underscores, then terminate it for the C-style parser
2475 while (pstart < p)
2477 if (*pstart != '_')
2478 stringbuffer.writeByte(*pstart);
2479 ++pstart;
2481 stringbuffer.writeByte(0);
2482 auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
2483 TOK result;
2484 bool isOutOfRange = false;
2485 t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, &isOutOfRange) : CTFloat.zero);
2486 switch (*p)
2488 case 'F':
2489 case 'f':
2490 if (isWellformedString && !isOutOfRange)
2491 isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
2492 result = TOK.float32Literal;
2493 p++;
2494 break;
2495 default:
2496 if (isWellformedString && !isOutOfRange)
2497 isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
2498 result = TOK.float64Literal;
2499 break;
2500 case 'l':
2501 if (!Ccompile)
2502 error("use 'L' suffix instead of 'l'");
2503 goto case 'L';
2504 case 'L':
2505 ++p;
// ImportC with an 8 byte long double: type the literal as a 64 bit float
2506 if (Ccompile && long_doublesize == 8)
2507 goto default;
2508 result = TOK.float80Literal;
2509 break;
// imaginary suffix, D only: map each real literal kind to its imaginary counterpart
2511 if ((*p == 'i' || *p == 'I') && !Ccompile)
2513 if (*p == 'I')
2514 error("use 'i' suffix instead of 'I'");
2515 p++;
2516 switch (result)
2518 case TOK.float32Literal:
2519 result = TOK.imaginary32Literal;
2520 break;
2521 case TOK.float64Literal:
2522 result = TOK.imaginary64Literal;
2523 break;
2524 case TOK.float80Literal:
2525 result = TOK.imaginary80Literal;
2526 break;
2527 default:
2528 break;
// range errors are never reported for 80 bit literals, and in ImportC only for hex floats
2531 const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
2532 if (isOutOfRange && !isLong && (!Ccompile || hex))
2534 /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex
2536 const char* suffix = (result == TOK.float32Literal || result == TOK.imaginary32Literal) ? "f" : "";
2537 error(scanloc, "number `%s%s` is not representable", sbufptr, suffix);
2539 debug
2541 switch (result)
2543 case TOK.float32Literal:
2544 case TOK.float64Literal:
2545 case TOK.float80Literal:
2546 case TOK.imaginary32Literal:
2547 case TOK.imaginary64Literal:
2548 case TOK.imaginary80Literal:
2549 break;
2550 default:
2551 assert(0);
2554 return result;
/// Refresh `scanloc` so that it describes the current scan position `p`
/// (column, plus file offset under `version (LocOffset)`) and return it.
final Loc loc() pure @nogc
{
    // column is 1-based and counted from the start of the current line
    const column = cast(uint)(1 + p - line);
    scanloc.charnum = column;
    version (LocOffset)
    {
        scanloc.fileOffset = cast(uint)(p - base);
    }
    return scanloc;
}
/// Report a printf-style formatted error at the location of the current token.
final void error(const(char)* format, ...)
{
    va_list ap;
    va_start(ap, format);
    scope (exit) va_end(ap);
    .verror(token.loc, format, ap);
}
/// Report a printf-style formatted error at the explicit location `loc`.
final void error(const ref Loc loc, const(char)* format, ...)
{
    va_list ap;
    va_start(ap, format);
    scope (exit) va_end(ap);
    .verror(loc, format, ap);
}
/// Report a printf-style formatted deprecation at the location of the current token.
final void deprecation(const(char)* format, ...)
{
    va_list ap;
    va_start(ap, format);
    scope (exit) va_end(ap);
    .vdeprecation(token.loc, format, ap);
}
/***************************************
 * Parse special token sequence.
 *
 * Scans the token that follows the `#`. `#line` is handed to `poundLine`;
 * any other identifier produces a warning and `#if` produces an error.
 * Returns:
 *      true if the special token sequence was handled
 * References:
 *      https://dlang.org/spec/lex.html#special-token-sequence
 */
bool parseSpecialTokenSequence()
{
    Token directive;
    scan(&directive);
    switch (directive.value)
    {
        case TOK.identifier:
            if (directive.ident == Id.line)
            {
                poundLine(directive, false);
                return true;
            }
            const where = loc();
            warning(where, "C preprocessor directive `#%s` is not supported", directive.ident.toChars());
            return false;

        case TOK.if_:
            error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
            return false;

        default:
            return false;
    }
}
2620 /*********************************************
2621 * Parse line/file preprocessor directive:
2622 * #line linnum [filespec]
2623 * Allow __LINE__ for linnum, and __FILE__ for filespec.
2624 * Accept linemarker format:
2625 * # linnum [filespec] {flags}
2626 * There can be zero or more flags, which are one of the digits 1..4, and
2627 * must be in ascending order. The flags are ignored.
2628 * Params:
2629 * tok = token we're on, which is linnum of linemarker
2630 * linemarker = true if line marker format and lexer is on linnum
2631 * References:
2632 * linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
// Handle the remainder of a `#line` directive or linemarker.
// The new line number and optional file name are collected first and only written to
// `scanloc` once the end of the line (or file) is reached, and only when the lexer is
// not inside a token string. Any malformed directive ends up at `Lerr`, which reports
// an error and skips the rest of the line.
2634 final void poundLine(ref Token tok, bool linemarker)
2636 auto linnum = this.scanloc.linnum;
2637 const(char)* filespec = null;
// set once a linemarker flag digit has been seen
2638 bool flags;
// for `#line` the number still has to be scanned; for a linemarker `tok` already is the number
2640 if (!linemarker)
2641 scan(&tok);
2642 if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal)
// the value must survive the round trip through int
2644 const lin = cast(int)(tok.unsvalue);
2645 if (lin != tok.unsvalue)
2647 error(tok.loc, "line number `%lld` out of range", cast(ulong)tok.unsvalue);
2648 skipToNextLine();
2649 return;
2651 else
2652 linnum = lin;
2654 else if (tok.value == TOK.line) // #line __LINE__
2657 else
2659 error(tok.loc, "positive integer argument expected following `#line`");
2660 if (tok.value != TOK.endOfLine)
2661 skipToNextLine();
2662 return;
// consume the optional file name and flags up to the end of the line
2664 while (1)
2666 scan(&tok);
2667 switch (tok.value)
2669 case TOK.endOfFile:
2670 case TOK.endOfLine:
2671 if (!inTokenStringConstant)
2673 this.scanloc.linnum = linnum;
2674 if (filespec)
2675 this.scanloc.filename = filespec;
2677 return;
2678 case TOK.file:
2679 if (filespec || flags)
2680 goto Lerr;
// `__FILE__`: keep the current file name, duplicated
2681 filespec = mem.xstrdup(scanloc.filename);
2682 continue;
2683 case TOK.string_:
2684 if (filespec || flags)
2685 goto Lerr;
// only a string that starts with a double quote and has no postfix is accepted
2686 if (tok.ptr[0] != '"' || tok.postfix != 0)
2687 goto Lerr;
2688 filespec = tok.ustring;
2689 continue;
2690 case TOK.int32Literal:
2691 if (!filespec)
2692 goto Lerr;
2693 if (linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4)
2695 flags = true; // linemarker flags seen
2696 continue;
2698 goto Lerr;
2699 default:
2700 goto Lerr;
// the message depends on how far parsing got; in ImportC a `#line` with trailing tokens is silently skipped
2703 Lerr:
2704 if (filespec is null)
2705 error(tok.loc, "invalid filename for `#line` directive");
2706 else if (linemarker)
2707 error(tok.loc, "invalid flag for line marker directive");
2708 else if (!Ccompile)
2709 error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars());
2710 if (tok.value != TOK.endOfLine)
2711 skipToNextLine();
/***************************************
 * Scan forward to start of next line.
 *
 * Stops without advancing when the end of file (0 or 0x1A) is reached.
 * Otherwise `p` ends up just past the line terminator (`\n`, `\r`, `\r\n`,
 * LS or PS), `endOfLine()` is called and `tokenizeNewlines` is cleared.
 */
final void skipToNextLine()
{
    for (;;)
    {
        const ch = *p;
        if (ch == 0 || ch == 0x1A)
            return; // do not advance p

        if (ch == '\n')
        {
            ++p;
            break;
        }
        if (ch == '\r')
        {
            ++p;
            if (*p == '\n')
                ++p;
            break;
        }

        // ordinary character, or a multi-byte sequence that may be LS / PS
        bool isLineSeparator = false;
        if (ch & 0x80)
        {
            const u = decodeUTF();
            isLineSeparator = (u == PS || u == LS);
        }
        ++p;
        if (isLineSeparator)
            break;
    }
    endOfLine();
    tokenizeNewlines = false;
}
/********************************************
 * Decode UTF character.
 * Issue error messages for invalid sequences.
 * Return decoded character, advance p to last character in UTF sequence.
 */
private uint decodeUTF()
{
    const start = p;
    assert(*start & 0x80);

    // Hand the decoder at most 4 bytes, fewer if a 0 byte comes first
    size_t avail = 1;
    while (avail < 4 && start[avail])
        ++avail;

    dchar decoded;
    size_t consumed = 0;
    const msg = utf_decodeChar(start[0 .. avail], consumed, decoded);
    // leave `p` on the last byte that was consumed, not one past it
    p += consumed - 1;
    if (msg)
        error("%.*s", cast(int)msg.length, msg.ptr);
    return decoded;
}
2781 /***************************************************
2782 * Parse doc comment embedded between t.ptr and p.
2783 * Remove trailing blanks and tabs from lines.
2784 * Replace all newlines with \n.
2785 * Remove leading comment character from each line.
2786 * Decide if it's a lineComment or a blockComment.
2787 * Append to previous one for this token.
2789 * If newParagraph is true, an extra newline will be
2790 * added between adjoining doc comments.
// Extract the text of the doc comment that spans `t.ptr` up to `p`, canonicalize it
// into a local buffer, and store it in `t.lineComment` or `t.blockComment`, merging it
// with a comment already stored there.
2792 private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
2794 /* ct tells us which kind of comment it is: '/', '*', or '+'
2796 const ct = t.ptr[2];
2797 /* Start of comment text skips over / * *, / + +, or / / /
2799 const(char)* q = t.ptr + 3; // start of comment text
2800 const(char)* qend = p;
// block comments end with a two character closer that is not part of the text
2801 if (ct == '*' || ct == '+')
2802 qend -= 2;
2803 /* Scan over initial row of ****'s or ++++'s or ////'s
2805 for (; q < qend; q++)
2807 if (*q != ct)
2808 break;
2810 /* Remove leading spaces until start of the comment
// `linestart` is nonzero while only leading decoration has been seen on the current line
2812 int linestart = 0;
2813 if (ct == '/')
2815 while (q < qend && (*q == ' ' || *q == '\t'))
2816 ++q;
2818 else if (q < qend)
2820 if (*q == '\r')
2822 ++q;
2823 if (q < qend && *q == '\n')
2824 ++q;
2825 linestart = 1;
2827 else if (*q == '\n')
2829 ++q;
2830 linestart = 1;
2833 /* Remove trailing row of ****'s or ++++'s
2835 if (ct != '/')
2837 for (; q < qend; qend--)
2839 if (qend[-1] != ct)
2840 break;
2843 /* Comment is now [q .. qend].
2844 * Canonicalize it into buf[].
2846 OutBuffer buf;
// drop spaces and tabs from the end of what has been written so far
2848 void trimTrailingWhitespace()
2850 const s = buf[];
2851 auto len = s.length;
2852 while (len && (s[len - 1] == ' ' || s[len - 1] == '\t'))
2853 --len;
2854 buf.setsize(len);
2857 for (; q < qend; q++)
2859 char c = *q;
2860 switch (c)
2862 case '*':
2863 case '+':
// the first comment character on a line is decoration and is not copied
2864 if (linestart && c == ct)
2866 linestart = 0;
2867 /* Trim preceding whitespace up to preceding \n
2869 trimTrailingWhitespace();
2870 continue;
2872 break;
2873 case ' ':
2874 case '\t':
2875 break;
2876 case '\r':
2877 if (q[1] == '\n')
2878 continue; // skip the \r
2879 goto Lnewline;
2880 default:
// 226 is 0xE2, the first byte of the UTF-8 encodings of LS (E2 80 A8) and PS (E2 80 A9)
2881 if (c == 226)
2883 // If LS or PS
2884 if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
2886 q += 2;
2887 goto Lnewline;
2890 linestart = 0;
2891 break;
2892 Lnewline:
2893 c = '\n'; // replace all newlines with \n
2894 goto case;
2895 case '\n':
2896 linestart = 1;
2897 /* Trim trailing whitespace
2899 trimTrailingWhitespace();
2900 break;
2902 buf.writeByte(c);
2904 /* Trim trailing whitespace (if the last line does not have newline)
2906 trimTrailingWhitespace();
2908 // Always end with a newline
2909 const s = buf[];
2910 if (s.length == 0 || s[$ - 1] != '\n')
2911 buf.writeByte('\n');
2913 // It's a line comment if the start of the doc comment comes
2914 // after other non-whitespace on the same line.
2915 auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
2916 // Combine with previous doc comment, if any
2917 if (*dc)
2918 *dc = combineComments(*dc, buf[], newParagraph).toDString();
2919 else
2920 *dc = buf.extractSlice(true);
/********************************************
 * Combine two document comments into one,
 * separated by an extra newline if newParagraph is true.
 *
 * If either comment is null, the pointer of the other one is returned as is.
 * Otherwise a fresh 0-terminated buffer is allocated that holds `c1`, a
 * newline if `c1` does not already end with one, the optional paragraph
 * newline, and then `c2`.
 */
static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
{
    //debug printf("Lexer::combineComments('%.*s', '%.*s', '%i')\n", cast(int) c1.length, c1.ptr, cast(int) c2.length, c2.ptr, newParagraph);
    if (!c1)
        return c2.ptr;
    if (!c2)
        return c1.ptr;

    const bool needsLineEnd = c1.length && c1[$ - 1] != '\n';
    const size_t total = c1.length + (needsLineEnd ? 1 : 0) + (newParagraph ? 1 : 0) + c2.length;
    char* merged = cast(char*)mem.xmalloc_noscan(total + 1);

    // fill the buffer front to back with a running write position
    size_t at = 0;
    merged[at .. at + c1.length] = c1[];
    at += c1.length;
    if (needsLineEnd)
        merged[at++] = '\n';
    if (newParagraph)
        merged[at++] = '\n';
    merged[at .. at + c2.length] = c2[];
    merged[total] = 0;
    return merged;
}
/**************************
 * Record that a new line starts at `p`: bump the line number of `scanloc`
 * and remember `p` as the start of the line.
 * `p` should be at start of next line
 */
private void endOfLine() pure @nogc @safe
{
    line = p;
    scanloc.linnum += 1;
}
2962 /******************************* Private *****************************************/
2964 private:
/// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
private struct TimeStampInfo
{
    private __gshared bool initdone = false;

    // Note: Those properties need to be guarded by a call to `initialize`
    // The API isn't safe, and quite brittle, but it was left this way
    // over performance concerns.
    // This is currently only called once, from the lexer.
    __gshared char[11 + 1] date;
    __gshared char[8 + 1] time;
    __gshared char[24 + 1] timestamp;

    /**
     * Fill `date`, `time` and `timestamp` once; later calls return immediately.
     *
     * The time comes from the environment variable `SOURCE_DATE_EPOCH` when it
     * is set, otherwise from the system clock.
     * Params:
     *      loc = location used for the error reported when `SOURCE_DATE_EPOCH`
     *            is not a valid number
     */
    public static void initialize(const ref Loc loc) nothrow
    {
        if (initdone)
            return;

        initdone = true;
        time_t ct;
        // https://issues.dlang.org/show_bug.cgi?id=20444
        if (auto p = getenv("SOURCE_DATE_EPOCH"))
        {
            if (!ct.parseDigits(p.toDString()))
                error(loc, "value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p);
        }
        else
            .time(&ct);
        const p = ctime(&ct);
        assert(p);
        // Slices of the `ctime` text are picked out by fixed offsets. The
        // writes are bounded by the destination size in addition to the
        // precision in the format string.
        snprintf(&date[0], date.length, "%.6s %.4s", p + 4, p + 20);
        snprintf(&time[0], time.length, "%.8s", p + 11);
        snprintf(&timestamp[0], timestamp.length, "%.24s", p);
    }
}
/// Unicode code points that the lexer treats as line terminators
private enum
{
    LS = 0x2028, // UTF line separator
    PS = 0x2029, // UTF paragraph separator
}
/********************************************
 * Do our own char maps
 *
 * `cmtable[c]` is the set of `CM*` flags that apply to the byte `c`.
 * The table is built once by the function literal below.
 */
private static immutable cmtable = ()
{
    ubyte[256] table;
    foreach (const c; 0 .. table.length)
    {
        ubyte bits = 0;

        if (c >= '0' && c <= '7')
            bits |= CMoctal;
        if (c_isxdigit(c))
            bits |= CMhex;
        if (c == '_' || c_isalnum(c))
            bits |= CMidchar;

        // what may follow a leading `0` and what may follow any other digit
        switch (c)
        {
            case 'b': case 'B':
            case 'x': case 'X':
                bits |= CMzerosecond;
                break;

            case '0': .. case '9':
            case '.':
            case '_':
            case 'e': case 'E':
            case 'f': case 'F':
            case 'i':
            case 'l': case 'L':
            case 'p': case 'P':
            case 'u': case 'U':
                bits |= CMzerosecond | CMdigitsecond;
                break;

            default:
                break;
        }

        // every 7-bit character except these six gets the single-char flag
        const excluded = c == '\\' || c == '\n' || c == '\r' ||
                         c == 0 || c == 0x1A || c == '\'';
        if (!excluded && (c & 0x80) == 0)
            bits |= CMsinglechar;

        table[c] = bits;
    }
    return table;
}();
/// Bit flags stored in `cmtable`, one bit per character class
private enum
{
    CMoctal       = 1 << 0,
    CMhex         = 1 << 1,
    CMidchar      = 1 << 2,
    CMzerosecond  = 1 << 3,
    CMdigitsecond = 1 << 4,
    CMsinglechar  = 1 << 5,
}
/// True if `cmtable` flags `c` with `CMoctal`.
private bool isoctal(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMoctal) == CMoctal;
}
/// True if `cmtable` flags `c` with `CMhex`.
private bool ishex(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMhex) == CMhex;
}
/// True if `cmtable` flags `c` with `CMidchar`.
private bool isidchar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMidchar) == CMidchar;
}
/// True if `cmtable` flags `c` with `CMzerosecond`.
private bool isZeroSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMzerosecond) == CMzerosecond;
}
/// True if `cmtable` flags `c` with `CMdigitsecond`.
private bool isDigitSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMdigitsecond) == CMdigitsecond;
}
/// True if `cmtable` flags `c` with `CMsinglechar`.
private bool issinglechar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMsinglechar) == CMsinglechar;
}
/// True if `c` is an ASCII hexadecimal digit (`0`-`9`, `a`-`f`, `A`-`F`).
private bool c_isxdigit(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            return true;

        default:
            return false;
    }
}
/// True if `c` is an ASCII letter or decimal digit.
private bool c_isalnum(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'z':
        case 'A': .. case 'Z':
            return true;

        default:
            return false;
    }
}
3115 /******************************* Unittest *****************************************/
// Checks that every valid escape sequence decodes to the expected character.
// A diagnostic handler that asserts is installed for the duration of the test, so any
// diagnostic that reaches it fails the test.
3117 unittest
3119 import dmd.console;
3120 nothrow bool assertDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
3121 const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
3123 assert(0);
3125 diagnosticHandler = &assertDiagnosticHandler;
// `sequence` is the text after the backslash; the whole of it must be consumed
3127 static void test(T)(string sequence, T expected, bool Ccompile = false)
3129 auto p = cast(const(char)*)sequence.ptr;
3130 assert(expected == Lexer.escapeSequence(Loc.initial, p, Ccompile));
3131 assert(p == sequence.ptr + sequence.length);
// single character escapes
3134 test(`'`, '\'');
3135 test(`"`, '"');
3136 test(`?`, '?');
3137 test(`\`, '\\');
3138 test(`0`, '\0');
3139 test(`a`, '\a');
3140 test(`b`, '\b');
3141 test(`f`, '\f');
3142 test(`n`, '\n');
3143 test(`r`, '\r');
3144 test(`t`, '\t');
3145 test(`v`, '\v');
// two digit hex escapes
3147 test(`x00`, 0x00);
3148 test(`xff`, 0xff);
3149 test(`xFF`, 0xff);
3150 test(`xa7`, 0xa7);
3151 test(`x3c`, 0x3c);
3152 test(`xe2`, 0xe2);
// octal escapes of one to three digits
3154 test(`1`, '\1');
3155 test(`42`, '\42');
3156 test(`357`, '\357');
// four and eight digit Unicode escapes
3158 test(`u1234`, '\u1234');
3159 test(`uf0e4`, '\uf0e4');
3161 test(`U0001f603`, '\U0001f603');
// named character entities
3163 test(`&quot;`, '"');
3164 test(`&lt;`, '<');
3165 test(`&gt;`, '>');
// remove the handler again so that it does not affect code that runs afterwards
3167 diagnosticHandler = null;
// Malformed escape sequences: Lexer.escapeSequence must report the expected
// error message, return the expected fallback character, and consume the
// expected number of input characters.
unittest
{
    import dmd.console;

    string expected;    // message the next diagnostic is required to carry
    bool gotError;      // set by the handler once a diagnostic has been seen

    /* Diagnostic handler that formats the incoming message and compares it
     * with `expected`. Only error-class diagnostics are accepted.
     */
    nothrow bool expectDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                         const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(cast(Classification)headerColor == Classification.error);

        gotError = true;
        char[100] buffer = void;
        // Bounded formatting: vsnprintf never writes past the buffer, whereas
        // the unbounded vsprintf would overflow the stack on a long message.
        const written = vsnprintf(buffer.ptr, buffer.length, format, ap);
        // A negative result is a formatting error; a result >= buffer.length
        // means the message was truncated. Either way the test must fail
        // cleanly instead of slicing out of bounds.
        assert(written >= 0 && written < buffer.length);
        auto actual = buffer[0 .. written];
        assert(expected == actual);
        return true;
    }

    diagnosticHandler = &expectDiagnosticHandler;

    /* Decode `sequence` (the text following the backslash), requiring exactly
     * the diagnostic `expectedError`, the return value `expectedReturnValue`,
     * and a scan pointer advanced by `expectedScanLength` characters.
     * global.errors is saved and restored so the deliberately provoked errors
     * do not remain in the global error count.
     */
    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
    {
        uint errors = global.errors;
        gotError = false;
        expected = expectedError;
        auto p = cast(const(char)*)sequence.ptr;
        auto actualReturnValue = Lexer.escapeSequence(Loc.initial, p, Ccompile);
        assert(gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
        global.errors = errors;
    }

    // Unknown escape characters; with Ccompile set, `&` is rejected as well
    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);
    test("&quot;", `undefined escape sequence \&`, '&', 1, true);

    // Too few hex digits after \x, \u and \U
    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    // Code points that are reported as invalid UTF characters
    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    // Non-hex character directly after \x, \u or \U
    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    // Named entities: unknown name, and missing terminating `;`
    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    // Octal value above \377
    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);

    diagnosticHandler = null;
}
// Smoke test: lexing "int" yields TOK.int32 and then TOK.endOfFile on every
// further call.
unittest
{
    string text = "int"; // We rely on the implicit null-terminator
    scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, 0, 0);

    TOK tok = lex1.nextToken();
    assert(tok == TOK.int32);

    // Reading past the end must keep returning endOfFile.
    foreach (attempt; 0 .. 3)
    {
        tok = lex1.nextToken();
        assert(tok == TOK.endOfFile);
    }
}
// Malformed input: whatever the lexer is fed, it must reach TOK.endOfFile
// within a bounded number of tokens and stay there.
unittest
{
    // Suppress Lexer error output for the duration of this test.
    uint gaggedErrors = global.startGagging();
    scope(exit) global.endGagging(gaggedErrors);

    // Every case must end with a 0 or 0x1A byte.
    static immutable char[][] testcases =
    [
        [0],                        // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        [0xFF, 0xFF, 0],            // repeats the previous case
        ['x', '"', 0x1A],
    ];

    foreach (input; testcases)
    {
        // The length handed to the Lexer is input.length-1, i.e. without the
        // trailing terminator byte.
        scope Lexer lexer = new Lexer(null, input.ptr, 0, input.length-1, 0, 0);

        // Allow at most input.length calls to nextToken() before giving up.
        TOK current = lexer.nextToken();
        for (size_t calls = 1; current != TOK.endOfFile && calls < input.length; ++calls)
            current = lexer.nextToken();
        assert(current == TOK.endOfFile);

        // One more read must still report endOfFile.
        current = lexer.nextToken();
        assert(current == TOK.endOfFile);
    }
}