// $Id: scanner.cpp,v 1.5 1999/02/17 19:07:57 shields Exp $ copyright notice #include "config.h" #include "scanner.h" #include "control.h" #include "error.h" int (*Scanner::scan_keyword[13]) (wchar_t *p1) = { ScanKeyword0, ScanKeyword0, ScanKeyword2, ScanKeyword3, ScanKeyword4, ScanKeyword5, ScanKeyword6, ScanKeyword7, ScanKeyword8, ScanKeyword9, ScanKeyword10, ScanKeyword0, ScanKeyword12 }; // // The constructor initializes all utility variables. // Scanner::Scanner(Control &control_) : control(control_) { // // If this assertion fails, the Token structure in stream.h must be redesigned !!! // assert(NUM_TERMINALS < 128); // // ------------------------------------------------------------------------------- // We are pulling this code out because we are tired of defending it. We // tought it was obvious that either $ should not have been used for compiler // generated variables or that users should not be allowed to use in variable names... // ------------------------------------------------------------------------------- // // For version 1.1 or above a $ may not be used as part of an identifier name // unless the user specifically requests that it be allowed. // // if (control.option.one_one && (! control.option.dollar)) // Code::SetBadCode(U_DOLLAR); // // // CLASSIFY_TOKEN is a mapping from each character into a // classification routine that is invoked when that character // is the first character encountered in a token. // for (int c = 0; c < 128; c++) { if (Code::IsAlpha(c)) classify_token[c] = &Scanner::ClassifyId; else if (Code::IsDigit(c)) classify_token[c] = &Scanner::ClassifyNumericLiteral; else classify_token[c] = &Scanner::ClassifyBadToken; } classify_token[128] = &Scanner::ClassifyNonAsciiUnicode; classify_token[U_a] = &Scanner::ClassifyIdOrKeyword; classify_token[U_b] = &Scanner::ClassifyIdOrKeyword; classify_token[U_c] = &Scanner::ClassifyIdOrKeyword; classify_token[U_d] = &Scanner::ClassifyIdOrKeyword; classify_token[U_e] = &Scanner::ClassifyIdOrKeyword; classify_token[U_f] = &Scanner::ClassifyIdOrKeyword; classify_token[U_g] = &Scanner::ClassifyIdOrKeyword; classify_token[U_i] = &Scanner::ClassifyIdOrKeyword; classify_token[U_l] = &Scanner::ClassifyIdOrKeyword; classify_token[U_n] = &Scanner::ClassifyIdOrKeyword; classify_token[U_p] = &Scanner::ClassifyIdOrKeyword; classify_token[U_r] = &Scanner::ClassifyIdOrKeyword; classify_token[U_s] = &Scanner::ClassifyIdOrKeyword; classify_token[U_t] = &Scanner::ClassifyIdOrKeyword; classify_token[U_v] = &Scanner::ClassifyIdOrKeyword; classify_token[U_w] = &Scanner::ClassifyIdOrKeyword; classify_token[U_SINGLE_QUOTE] = &Scanner::ClassifyCharLiteral; classify_token[U_DOUBLE_QUOTE] = &Scanner::ClassifyStringLiteral; classify_token[U_PLUS] = &Scanner::ClassifyPlus; classify_token[U_MINUS] = &Scanner::ClassifyMinus; classify_token[U_EXCLAMATION] = &Scanner::ClassifyNot; classify_token[U_PERCENT] = &Scanner::ClassifyMod; classify_token[U_CARET] = &Scanner::ClassifyXor; classify_token[U_AMPERSAND] = &Scanner::ClassifyAnd; classify_token[U_STAR] = &Scanner::ClassifyStar; classify_token[U_BAR] = &Scanner::ClassifyOr; classify_token[U_TILDE] = &Scanner::ClassifyComplement; classify_token[U_SLASH] = &Scanner::ClassifySlash; classify_token[U_GREATER] = &Scanner::ClassifyGreater; classify_token[U_LESS] = &Scanner::ClassifyLess; classify_token[U_LEFT_PARENTHESIS] = &Scanner::ClassifyLparen; classify_token[U_RIGHT_PARENTHESIS] = &Scanner::ClassifyRparen; classify_token[U_LEFT_BRACE] = &Scanner::ClassifyLbrace; classify_token[U_RIGHT_BRACE] = &Scanner::ClassifyRbrace; classify_token[U_LEFT_BRACKET] = &Scanner::ClassifyLbracket; classify_token[U_RIGHT_BRACKET] = &Scanner::ClassifyRbracket; classify_token[U_SEMICOLON] = &Scanner::ClassifySemicolon; classify_token[U_QUESTION] = &Scanner::ClassifyQuestion; classify_token[U_COLON] = &Scanner::ClassifyColon; classify_token[U_COMMA] = &Scanner::ClassifyComma; classify_token[U_DOT] = &Scanner::ClassifyPeriod; classify_token[U_EQUAL] = &Scanner::ClassifyEqual; return; } // // Associate a lexical stream with this file // void Scanner::Initialize(FileSymbol *file_symbol) { lex = new LexStream(control, file_symbol); lex -> Reset(); LexStream::Token *current_token = &(lex -> token_stream.Next()); // add 0th token ! current_token -> SetKind(0); current_token -> SetLocation(0); current_token -> SetSymbol(NULL); if (control.option.comments) { LexStream::Comment *current_comment = &(lex -> comment_stream.Next()); // add 0th comment ! current_comment -> string = NULL; current_comment -> length = 0; current_comment -> previous_token = -1; // No token precedes this comment current_comment -> location = 0; } lex -> line_location.Next() = 0; // mark starting location of line # 0 return; } // // This is one of the main entry point for the Java lexical analyser. // Its input is the name of a regular text file. Its output is a stream // of tokens. // void Scanner::SetUp(FileSymbol *file_symbol) { Initialize(file_symbol); lex -> CompressSpace(); file_symbol -> lex_stream = lex; return; } // // This is one of the main entry point for the Java lexical analyser. // Its input is the name of a regular text file. Its output is a stream // of tokens. // void Scanner::Scan(FileSymbol *file_symbol) { Initialize(file_symbol); lex -> ReadInput(); cursor = lex -> InputBuffer(); if (cursor) { Scan(); lex -> CompressSpace(); // // // if (control.option.dump_errors) { lex -> SortMessages(); for (int i = 0; i < lex -> bad_tokens.Length(); i++) lex -> PrintEmacsMessage(i); cout.flush(); } lex -> DestroyInput(); // get rid of input buffer } else { delete lex; lex = NULL; } file_symbol -> lex_stream = lex; return; } // // Scan the InputBuffer() and process all tokens and comments. // void Scanner::Scan() { wchar_t *input_buffer_tail = &cursor[lex -> InputBufferLength()]; // // CURSOR is assumed to point to the next character to be scanned. // Using CURSOR,we jump to the proper classification function // which scans and classifies the token and returns the location of // the character immediately following it. // do { SkipSpaces(); (this ->* classify_token[*cursor < 128 ? *cursor : 128])(); } while (cursor < input_buffer_tail); // // Add a a gate after the last line. // lex -> line_location.Next() = input_buffer_tail - lex -> InputBuffer(); // // If the brace_stack is not empty, then there are unmatched left // braces in the input. Each unmatched left brace should point to // the EOF token as a substitute for a matching right brace. // for (LexStream::TokenIndex left_brace = brace_stack.Top(); left_brace; left_brace = brace_stack.Top()) { lex -> token_stream[left_brace].SetRightBrace(lex -> token_stream.Length() - 1); brace_stack.Pop(); } return; } // // CURSOR points to the starting position of a comment. Scan the // the comment and return the location of the character immediately // following it. CURSOR may be destroyed. // void Scanner::ScanStarComment() { LexStream::Comment *current_comment = (control.option.comments ? &(lex -> comment_stream.Next()) : new LexStream::Comment()); current_comment -> string = NULL; current_comment -> previous_token = lex -> token_stream.Length() - 2; // the token that precedes this comment current_comment -> location = cursor - lex -> InputBuffer(); cursor += 2; for (;;) { while (*cursor != U_STAR && (! Code::IsNewline(*cursor)) && *cursor != U_CTL_Z) cursor++; if (*cursor == U_STAR) // Potential comment closer { while (*++cursor == U_STAR) ; if (*cursor == U_SLASH) { cursor++; current_comment -> length = (cursor - lex -> InputBuffer()) - current_comment -> location; if (! control.option.comments) delete current_comment; return; } } else if (Code::IsNewline(*cursor)) // Record new line { cursor++; lex -> line_location.Next() = cursor - lex -> InputBuffer(); } else break; } lex -> bad_tokens.Next().Initialize(StreamError::UNTERMINATED_COMMENT, current_comment -> location, (unsigned) (cursor - lex -> InputBuffer()) - 1); current_comment -> length = (cursor - lex -> InputBuffer()) - current_comment -> location; if (! control.option.comments) delete current_comment; return; } // // // void Scanner::ScanSlashComment() { if (control.option.comments) { LexStream::Comment *current_comment = &(lex -> comment_stream.Next()); current_comment -> string = NULL; current_comment -> previous_token = lex -> token_stream.Length() - 2; // the token that precedes this comment current_comment -> location = cursor - lex -> InputBuffer(); for (cursor += 2; ! Code::IsNewline(*cursor); cursor++) // skip all until \n ; current_comment -> length = (cursor - lex -> InputBuffer()) - current_comment -> location; } else { for (cursor += 2; ! Code::IsNewline(*cursor); cursor++) // skip all until \n ; } return; } // // This procedure is invoked to skip useless spaces in the input. // It assumes upon entry that CURSOR points to the next character to // be scanned. Before returning it sets CURSOR to the location of the // first non-space character following its initial position. // inline void Scanner::SkipSpaces() { do { while (Code::IsSpaceButNotNewline(*cursor)) cursor++; while (Code::IsNewline(*cursor)) // starting a new line? { cursor++; lex -> line_location.Next() = cursor - lex -> InputBuffer(); while (Code::IsSpaceButNotNewline(*cursor)) cursor++; } while (*cursor == U_SLASH) { if (cursor[1] == U_STAR) ScanStarComment(); else if (cursor[1] == U_SLASH) ScanSlashComment(); else break; } } while (Code::IsSpace(*cursor)); return; } /**********************************************************************/ /**********************************************************************/ /** **/ /** scan_keyword(i): **/ /** **/ /**********************************************************************/ /**********************************************************************/ /** **/ /** Scan an identifier of length I and determine if it is a keyword. **/ /** **/ /**********************************************************************/ /**********************************************************************/ int Scanner::ScanKeyword0(wchar_t *p1) { return TK_Identifier; } int Scanner::ScanKeyword2(wchar_t *p1) { if (p1[0] == U_d && p1[1] == U_o) return TK_do; else if (p1[0] == U_i && p1[1] == U_f) return TK_if; return TK_Identifier; } int Scanner::ScanKeyword3(wchar_t *p1) { switch(*p1) { case U_f: if (p1[1] == U_o && p1[2] == U_r) return TK_for; break; case U_i: if (p1[1] == U_n && p1[2] == U_t) return TK_int; break; case U_n: if (p1[1] == U_e && p1[2] == U_w) return TK_new; break; case U_t: if (p1[1] == U_r && p1[2] == U_y) return TK_try; break; } return TK_Identifier; } int Scanner::ScanKeyword4(wchar_t *p1) { switch (*p1) { case U_b: if (p1[1] == U_y && p1[2] == U_t && p1[3] == U_e) return TK_byte; break; case U_c: if (p1[1] == U_a && p1[2] == U_s && p1[3] == U_e) return TK_case; else if (p1[1] == U_h && p1[2] == U_a && p1[3] == U_r) return TK_char; break; case U_e: if (p1[1] == U_l && p1[2] == U_s && p1[3] == U_e) return TK_else; break; case U_g: if (p1[1] == U_o && p1[2] == U_t && p1[3] == U_o) return TK_goto; break; case U_l: if (p1[1] == U_o && p1[2] == U_n && p1[3] == U_g) return TK_long; break; case U_n: if (p1[1] == U_u && p1[2] == U_l && p1[3] == U_l) return TK_null; break; case U_t: if (p1[1] == U_h && p1[2] == U_i && p1[3] == U_s) return TK_this; else if (p1[1] == U_r && p1[2] == U_u && p1[3] == U_e) return TK_true; break; case U_v: if (p1[1] == U_o && p1[2] == U_i && p1[3] == U_d) return TK_void; break; } return TK_Identifier; } int Scanner::ScanKeyword5(wchar_t *p1) { switch (*p1) { case U_b: if (p1[1] == U_r && p1[2] == U_e && p1[3] == U_a && p1[4] == U_k) return TK_break; break; case U_c: if (p1[1] == U_a && p1[2] == U_t && p1[3] == U_c && p1[4] == U_h) return TK_catch; else if (p1[1] == U_l && p1[2] == U_a && p1[3] == U_s && p1[4] == U_s) return TK_class; else if (p1[1] == U_o && p1[2] == U_n && p1[3] == U_s && p1[4] == U_t) return TK_const; break; case U_f: if (p1[1] == U_a && p1[2] == U_l && p1[3] == U_s && p1[4] == U_e) return TK_false; else if (p1[1] == U_i && p1[2] == U_n && p1[3] == U_a && p1[4] == U_l) return TK_final; else if (p1[1] == U_l && p1[2] == U_o && p1[3] == U_a && p1[4] == U_t) return TK_float; break; case U_s: if (p1[1] == U_h && p1[2] == U_o && p1[3] == U_r && p1[4] == U_t) return TK_short; else if (p1[1] == U_u && p1[2] == U_p && p1[3] == U_e && p1[4] == U_r) return TK_super; break; case U_t: if (p1[1] == U_h && p1[2] == U_r && p1[3] == U_o && p1[4] == U_w) return TK_throw; break; case U_w: if (p1[1] == U_h && p1[2] == U_i && p1[3] == U_l && p1[4] == U_e) return TK_while; break; } return TK_Identifier; } int Scanner::ScanKeyword6(wchar_t *p1) { switch (*p1) { case U_d: if (p1[1] == U_o && p1[2] == U_u && p1[3] == U_b && p1[4] == U_l && p1[5] == U_e) return TK_double; break; case U_i: if (p1[1] == U_m && p1[2] == U_p && p1[3] == U_o && p1[4] == U_r && p1[5] == U_t) return TK_import; break; case U_n: if (p1[1] == U_a && p1[2] == U_t && p1[3] == U_i && p1[4] == U_v && p1[5] == U_e) return TK_native; break; case U_p: if (p1[1] == U_u && p1[2] == U_b && p1[3] == U_l && p1[4] == U_i && p1[5] == U_c) return TK_public; break; case U_r: if (p1[1] == U_e && p1[2] == U_t && p1[3] == U_u && p1[4] == U_r && p1[5] == U_n) return TK_return; break; case U_s: if (p1[1] == U_t && p1[2] == U_a && p1[3] == U_t && p1[4] == U_i && p1[5] == U_c) return TK_static; else if (p1[1] == U_w && p1[2] == U_i && p1[3] == U_t && p1[4] == U_c && p1[5] == U_h) return TK_switch; break; case U_t: if (p1[1] == U_h && p1[2] == U_r && p1[3] == U_o && p1[4] == U_w && p1[5] == U_s) return TK_throws; break; } return TK_Identifier; } int Scanner::ScanKeyword7(wchar_t *p1) { switch(*p1) { case U_b: if (p1[1] == U_o && p1[2] == U_o && p1[3] == U_l && p1[4] == U_e && p1[5] == U_a && p1[6] == U_n) return TK_boolean; case U_d: if (p1[1] == U_e && p1[2] == U_f && p1[3] == U_a && p1[4] == U_u && p1[5] == U_l && p1[6] == U_t) return TK_default; break; case U_e: if (p1[1] == U_x && p1[2] == U_t && p1[3] == U_e && p1[4] == U_n && p1[5] == U_d && p1[6] == U_s) return TK_extends; break; case U_f: if (p1[1] == U_i && p1[2] == U_n && p1[3] == U_a && p1[4] == U_l && p1[5] == U_l && p1[6] == U_y) return TK_finally; break; case U_p: if (p1[1] == U_a && p1[2] == U_c && p1[3] == U_k && p1[4] == U_a && p1[5] == U_g && p1[6] == U_e) return TK_package; else if (p1[1] == U_r && p1[2] == U_i && p1[3] == U_v && p1[4] == U_a && p1[5] == U_t && p1[6] == U_e) return TK_private; break; } return TK_Identifier; } int Scanner::ScanKeyword8(wchar_t *p1) { switch(*p1) { case U_a: if (p1[1] == U_b && p1[2] == U_s && p1[3] == U_t && p1[4] == U_r && p1[5] == U_a && p1[6] == U_c && p1[7] == U_t) return TK_abstract; break; case U_c: if (p1[1] == U_o && p1[2] == U_n && p1[3] == U_t && p1[4] == U_i && p1[5] == U_n && p1[6] == U_u && p1[7] == U_e) return TK_continue; break; case U_s: if (p1[1] == U_t && p1[2] == U_r && p1[3] == U_i && p1[4] == U_c && p1[5] == U_t && p1[6] == U_f && p1[7] == U_p) return TK_strictfp; break; case U_v: if (p1[1] == U_o && p1[2] == U_l && p1[3] == U_a && p1[4] == U_t && p1[5] == U_i && p1[6] == U_l && p1[7] == U_e) return TK_volatile; break; } return TK_Identifier; } int Scanner::ScanKeyword9(wchar_t *p1) { if (p1[0] == U_i && p1[1] == U_n && p1[2] == U_t && p1[3] == U_e && p1[4] == U_r && p1[5] == U_f && p1[6] == U_a && p1[7] == U_c && p1[8] == U_e) return TK_interface; else if (p1[0] == U_p && p1[1] == U_r && p1[2] == U_o && p1[3] == U_t && p1[4] == U_e && p1[5] == U_c && p1[6] == U_t && p1[7] == U_e && p1[8] == U_d) return TK_protected; else if (p1[0] == U_t && p1[1] == U_r && p1[2] == U_a && p1[3] == U_n && p1[4] == U_s && p1[5] == U_i && p1[6] == U_e && p1[7] == U_n && p1[8] == U_t) return TK_transient; return TK_Identifier; } int Scanner::ScanKeyword10(wchar_t *p1) { if (p1[0] == U_i && p1[1] == U_m && p1[2] == U_p && p1[3] == U_l && p1[4] == U_e && p1[5] == U_m && p1[6] == U_e && p1[7] == U_n && p1[8] == U_t && p1[9] == U_s) return TK_implements; else if (p1[0] == U_i && p1[1] == U_n && p1[2] == U_s && p1[3] == U_t && p1[4] == U_a && p1[5] == U_n && p1[6] == U_c && p1[7] == U_e && p1[8] == U_o && p1[9] == U_f) return TK_instanceof; return TK_Identifier; } int Scanner::ScanKeyword12(wchar_t *p1) { if (p1[0] == U_s && p1[1] == U_y && p1[2] == U_n && p1[3] == U_c && p1[4] == U_h && p1[5] == U_r && p1[6] == U_o && p1[7] == U_n && p1[8] == U_i && p1[9] == U_z && p1[10] == U_e&& p1[11] == U_d) return TK_synchronized; return TK_Identifier; } /**********************************************************************/ /* CHECK_OctalLiteral: */ /**********************************************************************/ /* Verify that an octal token is legal. If not, issue a message. */ /**********************************************************************/ inline void Scanner::CheckOctalLiteral(wchar_t *cursor, wchar_t *tail) { if (cursor[0] == U_0 && cursor[1] != U_x && cursor[1] != U_X) { wchar_t *p; for (p = cursor + 1; p < tail; p++) { if (*p == U_8 || *p == U_9) break; } if (p < tail) lex -> bad_tokens.Next().Initialize(StreamError::BAD_OCTAL_CONSTANT, (unsigned) (cursor - lex -> InputBuffer()), (unsigned) (tail - lex -> InputBuffer()) - 1); } return; } /**********************************************************************/ /* ClassifyCharLiteral: */ /**********************************************************************/ /* This procedure is invoked to scan a character literal or a large */ /* character literal. A large character literal is preceded by the */ /* letter L (capital L). After the character literal has been scanned */ /* and classified, it is entered in the table without its closing */ /* quote but with the opening quote (preceded by L if it's a large */ /* character literal). */ /**********************************************************************/ void Scanner::ClassifyCharLiteral() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_CharacterLiteral); wchar_t *ptr = cursor + 1; while (*ptr != U_SINGLE_QUOTE && (! Code::IsNewline(*ptr))) { if (*ptr++ == U_BACKSLASH) // In any case, skip the character { // If it was a backslash, if (! Code::IsNewline(*ptr)) // if the next char is not eol, skip it. ptr++; } } int len = ptr - cursor; if (*ptr == U_SINGLE_QUOTE) { if (len == 1) lex -> bad_tokens.Next().Initialize(StreamError::EMPTY_CHARACTER_CONSTANT, current_token -> Location(), (unsigned) (ptr - lex -> InputBuffer())); ptr++; } else { if (len == 1) /* Definitely, an isolated quote */ current_token -> SetKind(0); lex -> bad_tokens.Next().Initialize(StreamError::UNTERMINATED_CHARACTER_CONSTANT, current_token -> Location(), (unsigned) (ptr - lex -> InputBuffer()) - 1); } current_token -> SetSymbol(control.char_table.FindOrInsertLiteral(cursor, ptr - cursor)); cursor = ptr; return; } /**********************************************************************/ /* CLASSIFY_STRINGLITERAL: */ /**********************************************************************/ /* This procedure is invoked to scan a string literal or a large */ /* string literal. A large string literal is preceded by the letter */ /* L (capital L). After the string literal has been scanned and */ /* classified, it is entered in the table without its closing double */ /* quote but with the opening quote (preceded by L if it's a large */ /* string literal). */ /**********************************************************************/ void Scanner::ClassifyStringLiteral() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_StringLiteral); wchar_t *ptr = cursor + 1; while (*ptr != U_DOUBLE_QUOTE && (! Code::IsNewline(*ptr))) { if (*ptr++ == U_BACKSLASH) // In any case, skip the character { // If it was a backslash, if (! Code::IsNewline(*ptr)) // if the next char is not eol, skip it. ptr++; } } if (*ptr == U_DOUBLE_QUOTE) ptr++; else { if ((ptr - cursor) == 1) /* Definitely, an isolated double quote */ current_token -> SetKind(0); lex -> bad_tokens.Next().Initialize(StreamError::UNTERMINATED_STRING_CONSTANT, current_token -> Location(), (unsigned) (ptr - lex -> InputBuffer()) - 1); } current_token -> SetSymbol(control.string_table.FindOrInsertLiteral(cursor, ptr - cursor)); cursor = ptr; return; } /**********************************************************************/ /* CLASSIFYIDORKEYWORD: */ /**********************************************************************/ /* This procedure is invoked when CURSOR points to one of the */ /* following characters: */ /* */ /* 'a' */ /* 'b' */ /* 'c' */ /* 'd' */ /* 'e' */ /* 'f' */ /* 'g' */ /* 'i' */ /* 'l' */ /* 'n' */ /* 'o' */ /* 'p' */ /* 'r' */ /* 's' */ /* 't' */ /* 'v' */ /* 'w' */ /* */ /* It scans the identifier and checks whether or not it is a keyword. */ /* */ /* NOTE that the use of that check is a time-optimization that is not */ /* required for correctness. */ /**********************************************************************/ void Scanner::ClassifyIdOrKeyword() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); wchar_t *ptr = cursor + 1; while (Code::IsAlnum(*ptr)) ptr++; int len = ptr - cursor; current_token -> SetKind(len < 13 ? (scan_keyword[len])(cursor) : TK_Identifier); if (current_token -> Kind() == TK_Identifier) { current_token -> SetSymbol((NameSymbol *) control.FindOrInsertName(cursor, len)); for (int i = 0; i < control.option.keyword_map.Length(); i++) { if (control.option.keyword_map[i].length == len && wcsncmp(cursor, control.option.keyword_map[i].name, len) == 0) current_token -> SetKind(control.option.keyword_map[i].key); } } else { current_token -> SetSymbol(NULL); if (current_token -> Kind() == TK_class || current_token -> Kind() == TK_interface) { if (brace_stack.Size() == 0) // This type keyword is not nested. lex -> type_index.Next() = lex -> token_stream.Length() - 1; } } cursor = ptr; return; } /**********************************************************************/ /* CLASSIFY_ID: */ /**********************************************************************/ /* This procedure is invoked when CURSOR points to an alphabetic */ /* character other than the ones identified above or '$' or '_'. */ /* A token that starts with one of these letters is an identifier. */ /**********************************************************************/ void Scanner::ClassifyId() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); wchar_t *ptr = cursor + 1; while (Code::IsAlnum(*ptr)) ptr++; int len = ptr - cursor; current_token -> SetKind(TK_Identifier); current_token -> SetSymbol(control.FindOrInsertName(cursor, len)); for (int i = 0; i < control.option.keyword_map.Length(); i++) { if (control.option.keyword_map[i].length == len && wcsncmp(cursor, control.option.keyword_map[i].name, len) == 0) current_token -> SetKind(control.option.keyword_map[i].key); } cursor = ptr; return; } /**********************************************************************/ /* CLASSIFY_NUMERICLITERAL: */ /**********************************************************************/ /* This procedure is invoked when CURSOR points directly to one of */ /* the characters below or to a '.' followed by one of the characters */ /* below: */ /* */ /* case '0': case '1': case '2': case '3': case '4': */ /* case '5': case '6': case '7': case '8': case '9': */ /* */ /* Such a token is classified as a numeric literal: */ /* */ /* TK_LongLiteral, TK_IntegerLiteral, */ /* TK_DOUBLELiteral, TK_FloatingPointLiteral */ /**********************************************************************/ void Scanner::ClassifyNumericLiteral() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); /******************************************************************/ /* Scan the initial sequence of digits if any. */ /******************************************************************/ wchar_t *ptr; for (ptr = cursor; Code::IsDigit(*ptr); ptr++) ; /******************************************************************/ /* We now take an initial crack at classifying the numeric token. */ /* we have four cases to consider. */ /* */ /* 1) If the initial (perhaps an empty) sequence of digits is */ /* followed by a period ('.'), we have a floating-constant. */ /* We scan the sequence of digits (if any) that follows the */ /* period. */ /* */ /* 2) Otherwise, we hava an integer literal. */ /* */ /* If the initial (can't be empty) sequence of digits start */ /* with "0x" or "0X" we have a hexadecimal constant: */ /* continue scanning all hex-digits that follow the 'x'. */ /******************************************************************/ if (*ptr == U_DOT) { current_token -> SetKind(TK_DoubleLiteral); for (ptr++; Code::IsDigit(*ptr); ptr++) ; } else { current_token -> SetKind(TK_IntegerLiteral); if (*cursor == U_0 && (cursor[1] == U_x || cursor[1] == U_X)) { ptr = cursor + 2; if (isxdigit(*ptr)) { for (ptr++; isxdigit(*ptr); ptr++) ; } else lex -> bad_tokens.Next().Initialize(StreamError::INVALID_HEX_CONSTANT, current_token -> Location(), (unsigned) (ptr - lex -> InputBuffer()) - 1); } } /******************************************************************/ /* If the initial numeric token is followed by an exponent, then */ /* it is a floating-constant. If that's the case, the literal is */ /* reclassified ant the exponent is scanned. */ /* */ /* NOTE that as 'E' and 'e' are legitimate hexadecimal digits, we */ /* don't have to worry about a hexadecimal constant being used as */ /* the prefix of a floating-constant. E.g., 0x123e12 is tokenized */ /* as a single hexadecimal digit. The string 0x123e+12 gets */ /* broken down as the hex number 0x123e, the operator '+' and the */ /* decimal constant 12. */ /******************************************************************/ if (*ptr == U_e || *ptr == U_E) { current_token -> SetKind(TK_DoubleLiteral); ptr++; /* Skip the 'e' or 'E' */ if (*ptr == U_PLUS || *ptr == U_MINUS) ptr++; /* Skip the '+' or '-' */ if (Code::IsDigit(*ptr)) { for (ptr++; Code::IsDigit(*ptr); ptr++) ; } else lex -> bad_tokens.Next().Initialize(StreamError::INVALID_FLOATING_CONSTANT_EXPONENT, current_token -> Location(), (unsigned) (ptr - lex -> InputBuffer()) - 1); } /******************************************************************/ /* A numeric constant may be suffixed by a letter that further */ /* qualifies what kind of a constant it is. We check for these */ /* suffixes here. */ /******************************************************************/ int len; if (*ptr == U_f || *ptr == U_F) { ptr++; len = ptr - cursor; current_token -> SetSymbol(control.float_table.FindOrInsertLiteral(cursor, len)); current_token -> SetKind(TK_FloatingPointLiteral); } else if (*ptr == U_d || *ptr == U_D) { ptr++; len = ptr - cursor; current_token -> SetSymbol(control.double_table.FindOrInsertLiteral(cursor, len)); current_token -> SetKind(TK_DoubleLiteral); } else if (current_token -> Kind() == TK_IntegerLiteral) { if (*ptr == U_l || *ptr == U_L) { ptr++; /* Skip the 'l' or 'L' */ len = ptr - cursor; current_token -> SetSymbol(control.long_table.FindOrInsertLiteral(cursor, len)); current_token -> SetKind(TK_LongLiteral); } else { len = ptr - cursor; current_token -> SetSymbol(control.int_table.FindOrInsertLiteral(cursor, len)); } CheckOctalLiteral(cursor, ptr); } else { len = ptr - cursor; current_token -> SetSymbol(control.double_table.FindOrInsertLiteral(cursor, len)); current_token -> SetKind(TK_DoubleLiteral); } /******************************************************************/ /* We now have scanned the complete token and it has been properly*/ /* classified. CURSOR points to its first character in the buffer */ /* and PTR points to the character immediately following it. We */ /* insert the name into the name table and if the token is an */ /* octal constant, we check that all the digits in its name are */ /* in the range 0-7. */ /******************************************************************/ cursor = ptr; return; } /**********************************************************************/ /* CLASSIFY_COLON: */ /**********************************************************************/ void Scanner::ClassifyColon() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_COLON); current_token -> SetSymbol(NULL); cursor++; return; } /**********************************************************************/ /* CLASSIFY_PLUS: */ /**********************************************************************/ void Scanner::ClassifyPlus() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); cursor++; if (*cursor == U_PLUS) { cursor++; current_token -> SetKind(TK_PLUS_PLUS); } else if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_PLUS_EQUAL); } else current_token -> SetKind(TK_PLUS); current_token -> SetSymbol(NULL); return; } /**********************************************************************/ /* CLASSIFY_MINUS: */ /**********************************************************************/ void Scanner::ClassifyMinus() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); cursor++; if (*cursor == U_MINUS) { cursor++; current_token -> SetKind(TK_MINUS_MINUS); } else if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_MINUS_EQUAL); } else current_token -> SetKind(TK_MINUS); current_token -> SetSymbol(NULL); return; } /**********************************************************************/ /* CLASSIFY_STAR: */ /**********************************************************************/ void Scanner::ClassifyStar() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); cursor++; if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_MULTIPLY_EQUAL); } else current_token -> SetKind(TK_MULTIPLY); current_token -> SetSymbol(NULL); return; } /**********************************************************************/ /* CLASSIFY_SLASH: */ /**********************************************************************/ void Scanner::ClassifySlash() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); cursor++; if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_DIVIDE_EQUAL); } else current_token -> SetKind(TK_DIVIDE); current_token -> SetSymbol(NULL); return; } /**********************************************************************/ /* CLASSIFY_LESS: */ /**********************************************************************/ void Scanner::ClassifyLess() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); cursor++; if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_LESS_EQUAL); } else if (*cursor == U_LESS) { cursor++; if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_LEFT_SHIFT_EQUAL); } else current_token -> SetKind(TK_LEFT_SHIFT); } else current_token -> SetKind(TK_LESS); current_token -> SetSymbol(NULL); return; } /**********************************************************************/ /* CLASSIFY_GREATER: */ /**********************************************************************/ void Scanner::ClassifyGreater() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); cursor++; if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_GREATER_EQUAL); } else if (*cursor == U_GREATER) { cursor++; if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_RIGHT_SHIFT_EQUAL); } else if (*cursor == U_GREATER) { cursor++; if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_UNSIGNED_RIGHT_SHIFT_EQUAL); } else current_token -> SetKind(TK_UNSIGNED_RIGHT_SHIFT); } else current_token -> SetKind(TK_RIGHT_SHIFT); } else current_token -> SetKind(TK_GREATER); current_token -> SetSymbol(NULL); return; } /**********************************************************************/ /* CLASSIFY_AND: */ /**********************************************************************/ void Scanner::ClassifyAnd() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); cursor++; if (*cursor == U_AMPERSAND) { cursor++; current_token -> SetKind(TK_AND_AND); } else if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_AND_EQUAL); } else current_token -> SetKind(TK_AND); current_token -> SetSymbol(NULL); return; } /**********************************************************************/ /* CLASSIFY_OR: */ /**********************************************************************/ void Scanner::ClassifyOr() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); cursor++; if (*cursor == U_BAR) { cursor++; current_token -> SetKind(TK_OR_OR); } else if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_OR_EQUAL); } else current_token -> SetKind(TK_OR); current_token -> SetSymbol(NULL); return; } /**********************************************************************/ /* CLASSIFY_XOR: */ /**********************************************************************/ void Scanner::ClassifyXor() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); cursor++; if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_XOR_EQUAL); } else current_token -> SetKind(TK_XOR); current_token -> SetSymbol(NULL); return; } /**********************************************************************/ /* CLASSIFY_NOT: */ /**********************************************************************/ void Scanner::ClassifyNot() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); cursor++; if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_NOT_EQUAL); } else current_token -> SetKind(TK_NOT); current_token -> SetSymbol(NULL); return; } /**********************************************************************/ /* CLASSIFY_EQUAL: */ /**********************************************************************/ void Scanner::ClassifyEqual() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); cursor++; if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_EQUAL_EQUAL); } else current_token -> SetKind(TK_EQUAL); current_token -> SetSymbol(NULL); return; } /**********************************************************************/ /* CLASSIFY_MOD: */ /**********************************************************************/ void Scanner::ClassifyMod() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); cursor++; if (*cursor == U_EQUAL) { cursor++; current_token -> SetKind(TK_REMAINDER_EQUAL); } else current_token -> SetKind(TK_REMAINDER); current_token -> SetSymbol(NULL); return; } /**********************************************************************/ /* CLASSIFY_PERIOD: */ /**********************************************************************/ void Scanner::ClassifyPeriod() { if (Code::IsDigit(cursor[1])) // Is period immediately followed by digit? ClassifyNumericLiteral(); else { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_DOT); current_token -> SetSymbol(NULL); cursor++; } return; } /**********************************************************************/ /* CLASSIFY_SEMICOLON: */ /**********************************************************************/ void Scanner::ClassifySemicolon() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_SEMICOLON); current_token -> SetSymbol(NULL); cursor++; return; } /**********************************************************************/ /* CLASSIFY_COMMA: */ /**********************************************************************/ void Scanner::ClassifyComma() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_COMMA); current_token -> SetSymbol(NULL); cursor++; return; } /**********************************************************************/ /* CLASSIFY_LBRACE: */ /**********************************************************************/ void Scanner::ClassifyLbrace() { // // Instead of setting the symbol for a left brace, we keep track of it. // When we encounter its matching right brace, we use the symbol field // to identify its counterpart. // brace_stack.Push(lex -> token_stream.Length()); LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_LBRACE); cursor++; return; } /**********************************************************************/ /* CLASSIFY_RBRACE: */ /**********************************************************************/ void Scanner::ClassifyRbrace() { // // When a left brace in encountered, it is pushed into the brace_stack. // When its matching right brace in encountered, we pop the left brace // and make it point to its matching right brace. // LexStream::TokenIndex left_brace = brace_stack.Top(); if (left_brace) // This right brace is matched by a left one { lex -> token_stream[left_brace].SetRightBrace(lex -> token_stream.Length()); brace_stack.Pop(); } LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_RBRACE); current_token -> SetSymbol(NULL); cursor++; return; } /**********************************************************************/ /* CLASSIFY_LPAREN: */ /**********************************************************************/ void Scanner::ClassifyLparen() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_LPAREN); current_token -> SetSymbol(NULL); cursor++; return; } /**********************************************************************/ /* CLASSIFY_RPAREN: */ /**********************************************************************/ void Scanner::ClassifyRparen() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_RPAREN); current_token -> SetSymbol(NULL); cursor++; return; } /**********************************************************************/ /* CLASSIFY_LBRACKET: */ /**********************************************************************/ void Scanner::ClassifyLbracket() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_LBRACKET); current_token -> SetSymbol(NULL); cursor++; return; } /**********************************************************************/ /* CLASSIFY_RBRACKET: */ /**********************************************************************/ void Scanner::ClassifyRbracket() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_RBRACKET); current_token -> SetSymbol(NULL); cursor++; return; } /**********************************************************************/ /* CLASSIFY_COMPLEMENT: */ /**********************************************************************/ void Scanner::ClassifyComplement() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_TWIDDLE); current_token -> SetSymbol(NULL); cursor++; return; } /**********************************************************************/ /* CLASSIFY_BAD_TOKEN: */ /**********************************************************************/ void Scanner::ClassifyBadToken() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(0); current_token -> SetSymbol(control.FindOrInsertName(cursor, 1)); if (++cursor < &lex -> InputBuffer()[lex -> InputBufferLength()]) // not the terminating character? lex -> bad_tokens.Next().Initialize(StreamError::BAD_TOKEN, current_token -> Location(), current_token -> Location()); else current_token -> SetKind(TK_EOF); return; } /**********************************************************************/ /* CLASSIFY_QUESTION: */ /**********************************************************************/ /**********************************************************************/ void Scanner::ClassifyQuestion() { LexStream::Token *current_token = &(lex -> token_stream.Next()); current_token -> SetLocation(cursor - lex -> InputBuffer()); current_token -> SetKind(TK_QUESTION); current_token -> SetSymbol(NULL); cursor++; return; } /**********************************************************************/ /* CLASSIFY_NONASCIIUNICODE: */ /**********************************************************************/ void Scanner::ClassifyNonAsciiUnicode() { if (Code::IsAlpha(*cursor)) // Some kind of non-ascii unicode letter ClassifyId(); else ClassifyBadToken(); return; }