Line data Source code
1 : /* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. 2 : * Use of this file is governed by the BSD 3-clause license that 3 : * can be found in the LICENSE.txt file in the project root. 4 : */ 5 : 6 : #pragma once 7 : 8 : #include "Recognizer.h" 9 : #include "TokenSource.h" 10 : #include "CharStream.h" 11 : #include "Token.h" 12 : 13 : namespace antlr4 { 14 : 15 : /// A lexer is recognizer that draws input symbols from a character stream. 16 : /// lexer grammars result in a subclass of this object. A Lexer object 17 : /// uses simplified match() and error recovery mechanisms in the interest 18 : /// of speed. 19 : class ANTLR4CPP_PUBLIC Lexer : public Recognizer, public TokenSource { 20 : public: 21 : static const size_t DEFAULT_MODE = 0; 22 : static const size_t MORE = static_cast<size_t>(-2); 23 : static const size_t SKIP = static_cast<size_t>(-3); 24 : 25 : static const size_t DEFAULT_TOKEN_CHANNEL = Token::DEFAULT_CHANNEL; 26 : static const size_t HIDDEN = Token::HIDDEN_CHANNEL; 27 : static const size_t MIN_CHAR_VALUE = 0; 28 : static const size_t MAX_CHAR_VALUE = 0x10FFFF; 29 : 30 : CharStream *_input; // Pure reference, usually from statically allocated instance. 31 : 32 : protected: 33 : /// How to create token objects. 34 : Ref<TokenFactory<CommonToken>> _factory; 35 : 36 : public: 37 : /// The goal of all lexer rules/methods is to create a token object. 38 : /// This is an instance variable as multiple rules may collaborate to 39 : /// create a single token. nextToken will return this object after 40 : /// matching lexer rule(s). If you subclass to allow multiple token 41 : /// emissions, then set this to the last token to be matched or 42 : /// something nonnull so that the auto token emit mechanism will not 43 : /// emit another token. 44 : 45 : // Life cycle of a token is this: 46 : // Created by emit() (via the token factory) or by action code, holding ownership of it. 47 : // Ownership is handed over to the token stream when calling nextToken(). 48 : std::unique_ptr<Token> token; 49 : 50 : /// <summary> 51 : /// What character index in the stream did the current token start at? 52 : /// Needed, for example, to get the text for current token. Set at 53 : /// the start of nextToken. 54 : /// </summary> 55 : size_t tokenStartCharIndex; 56 : 57 : /// <summary> 58 : /// The line on which the first character of the token resides </summary> 59 : size_t tokenStartLine; 60 : 61 : /// The character position of first character within the line. 62 : size_t tokenStartCharPositionInLine; 63 : 64 : /// Once we see EOF on char stream, next token will be EOF. 65 : /// If you have DONE : EOF ; then you see DONE EOF. 66 : bool hitEOF; 67 : 68 : /// The channel number for the current token. 69 : size_t channel; 70 : 71 : /// The token type for the current token. 72 : size_t type; 73 : 74 : // Use the vector as a stack. 75 : std::vector<size_t> modeStack; 76 : size_t mode; 77 : 78 : Lexer(); 79 : Lexer(CharStream *input); 80 0 : virtual ~Lexer() {} 81 : 82 : virtual void reset(); 83 : 84 : /// Return a token from this source; i.e., match a token on the char stream. 85 : virtual std::unique_ptr<Token> nextToken() override; 86 : 87 : /// Instruct the lexer to skip creating a token for current lexer rule 88 : /// and look for another token. nextToken() knows to keep looking when 89 : /// a lexer rule finishes with token set to SKIP_TOKEN. Recall that 90 : /// if token == null at end of any token rule, it creates one for you 91 : /// and emits it. 92 : virtual void skip(); 93 : virtual void more(); 94 : virtual void setMode(size_t m); 95 : virtual void pushMode(size_t m); 96 : virtual size_t popMode(); 97 : 98 : template<typename T1> 99 : void setTokenFactory(TokenFactory<T1> *factory) { 100 : this->_factory = factory; 101 : } 102 : 103 : virtual Ref<TokenFactory<CommonToken>> getTokenFactory() override; 104 : 105 : /// Set the char stream and reset the lexer 106 : virtual void setInputStream(IntStream *input) override; 107 : 108 : virtual std::string getSourceName() override; 109 : 110 : virtual CharStream* getInputStream() override; 111 : 112 : /// By default does not support multiple emits per nextToken invocation 113 : /// for efficiency reasons. Subclasses can override this method, nextToken, 114 : /// and getToken (to push tokens into a list and pull from that list 115 : /// rather than a single variable as this implementation does). 116 : virtual void emit(std::unique_ptr<Token> newToken); 117 : 118 : /// The standard method called to automatically emit a token at the 119 : /// outermost lexical rule. The token object should point into the 120 : /// char buffer start..stop. If there is a text override in 'text', 121 : /// use that to set the token's text. Override this method to emit 122 : /// custom Token objects or provide a new factory. 123 : virtual Token* emit(); 124 : 125 : virtual Token* emitEOF(); 126 : 127 : virtual size_t getLine() const override; 128 : 129 : virtual size_t getCharPositionInLine() override; 130 : 131 : virtual void setLine(size_t line); 132 : 133 : virtual void setCharPositionInLine(size_t charPositionInLine); 134 : 135 : /// What is the index of the current character of lookahead? 136 : virtual size_t getCharIndex(); 137 : 138 : /// Return the text matched so far for the current token or any 139 : /// text override. 140 : virtual std::string getText(); 141 : 142 : /// Set the complete text of this token; it wipes any previous 143 : /// changes to the text. 144 : virtual void setText(const std::string &text); 145 : 146 : /// Override if emitting multiple tokens. 147 : virtual std::unique_ptr<Token> getToken(); 148 : 149 : virtual void setToken(std::unique_ptr<Token> newToken); 150 : 151 : virtual void setType(size_t ttype); 152 : 153 : virtual size_t getType(); 154 : 155 : virtual void setChannel(size_t newChannel); 156 : 157 : virtual size_t getChannel(); 158 : 159 : virtual const std::vector<std::string>& getChannelNames() const = 0; 160 : 161 : virtual const std::vector<std::string>& getModeNames() const = 0; 162 : 163 : /// Return a list of all Token objects in input char stream. 164 : /// Forces load of all tokens. Does not include EOF token. 165 : virtual std::vector<std::unique_ptr<Token>> getAllTokens(); 166 : 167 : virtual void recover(const LexerNoViableAltException &e); 168 : 169 : virtual void notifyListeners(const LexerNoViableAltException &e); 170 : 171 : virtual std::string getErrorDisplay(const std::string &s); 172 : 173 : /// Lexers can normally match any char in it's vocabulary after matching 174 : /// a token, so do the easy thing and just kill a character and hope 175 : /// it all works out. You can instead use the rule invocation stack 176 : /// to do sophisticated error recovery if you are in a fragment rule. 177 : virtual void recover(RecognitionException *re); 178 : 179 : /// <summary> 180 : /// Gets the number of syntax errors reported during parsing. This value is 181 : /// incremented each time <seealso cref="#notifyErrorListeners"/> is called. 182 : /// </summary> 183 : /// <seealso cref= #notifyListeners </seealso> 184 : virtual size_t getNumberOfSyntaxErrors(); 185 : 186 : protected: 187 : /// You can set the text for the current token to override what is in 188 : /// the input char buffer (via setText()). 189 : std::string _text; 190 : 191 : private: 192 : size_t _syntaxErrors; 193 : void InitializeInstanceFields(); 194 : }; 195 : 196 : } // namespace antlr4