// WARNING: Changes to this file must be contributed back to Sawyer or else they will
// be clobbered by the next update from Sawyer. The Sawyer repository is at
// https://github.com/matzke1/sawyer.

#ifndef Sawyer_Lexer_H
#define Sawyer_Lexer_H

#include <Sawyer/AllocatingBuffer.h>
#include <Sawyer/LineVector.h>
#include <Sawyer/Optional.h>
#include <Sawyer/Sawyer.h>

#include <boost/filesystem.hpp>

#include <cstring>
#include <string>
#include <utility>
#include <vector>

namespace Sawyer {

namespace Lexer {

/** Represents one token of input.
 *
 *  Each token has a user-defined type, which is some enumerated constant, or no type at all, in which case the token
 *  represents the end of the input stream. Tokens do not store their own lexeme; rather, they point to the beginning and
 *  end of their lexeme in the input stream. */
template<typename T>
class SAWYER_EXPORT Token {
public:
    typedef T TokenEnum;

private:
    Optional<TokenEnum> type_;                          // empty means EOF
    size_t begin_;                                      // beginning character position in the input
    size_t end_;                                        // ending (exclusive) character position in the input

public:
    /** Construct an EOF token. */
    Token(): begin_(0), end_(0) {}

    /** Construct a token.
     *
     *  The token has the specified type, and its lexeme consists of the characters at the specified positions in the input
     *  stream: @p begin points to the first character of the lexeme and @p end points to one past the last character. A
     *  token's lexeme may be the empty string, indicated by giving the same value for @p begin and @p end; however, @p end
     *  must never be less than @p begin. */
    Token(TokenEnum type, size_t begin, size_t end)
        : type_(type), begin_(begin), end_(end) {
        ASSERT_require(end >= begin);
    }

    /** Whether this is an EOF token.
     *
     *  An EOF token is a special token that has no type and an empty lexeme. EOF tokens are constructed by the default
     *  constructor. */
    bool isEof() const {
        return !type_;
    }

    /** Returns the token type.
     *
     *  Since EOF tokens have no type, this must not be called for an EOF token. */
    TokenEnum type() const {
        return *type_;
    }

    /** Token lexeme starting position.
     *
     *  This is the starting offset in the input for the first character of this token's lexeme. */
    size_t begin() const {
        return begin_;
    }

    /** Token lexeme ending position.
     *
     *  This is the offset in the input for one position past the last character of this token's lexeme. It is guaranteed to
     *  be greater than or equal to the @ref begin position. An EOF token always has @ref begin equal to @ref end, but other,
     *  non-EOF tokens may also be empty. */
    size_t end() const {
        return end_;
    }
};
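
// Example: constructing and querying tokens. This is an illustrative sketch; the
// TokenType enumeration and its members are hypothetical, not part of Sawyer.
//
//     enum TokenType { TOK_WORD, TOK_NUMBER };
//
//     Sawyer::Lexer::Token<TokenType> eof;                    // default-constructed EOF token
//     Sawyer::Lexer::Token<TokenType> word(TOK_WORD, 0, 5);   // lexeme is input characters [0,5)
//     ASSERT_require(!word.isEof());
//     ASSERT_require(word.type() == TOK_WORD);
//     ASSERT_require(word.end() - word.begin() == 5);         // lexeme length in characters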

/** An ordered list of tokens scanned from input.
 *
 *  A token stream is an ordered list of tokens scanned from an unchanging input stream and consumed in the order they're
 *  produced. */
template<class T>
class SAWYER_EXPORT TokenStream {
public:
    typedef T Token;

private:
    std::string name_;                                  // name of stream (e.g., a file name)
    Container::LineVector content_;                     // line-oriented character contents of the stream
    size_t at_;                                         // cursor position in stream
    std::vector<Token> tokens_;                         // current token at [0] plus lookahead

public:
    virtual ~TokenStream() {}

    /** Create a token stream from the contents of a file. */
    explicit TokenStream(const boost::filesystem::path &fileName)
        : name_(fileName.string()), content_(fileName.string()), at_(0) {}

    /** Create a token stream from a string.
     *
     *  The string content is copied into the lexer, so the caller's string can be modified after this constructor returns
     *  without affecting the token stream. */
    explicit TokenStream(const std::string &inputString)
        : name_("string"), content_(Container::AllocatingBuffer<size_t, char>::instance(inputString)), at_(0) {}

    /** Create a token stream from a buffer.
     *
     *  The token stream uses the specified buffer, which should not be modified while the token stream is alive. */
    explicit TokenStream(const Container::Buffer<size_t, char>::Ptr &buffer)
        : name_("string"), content_(buffer), at_(0) {}
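
    // Example: the three ways to construct a stream. Illustrative sketch; "MyLexer"
    // is a hypothetical concrete subclass (one is sketched after this class).
    //
    //     MyLexer a(boost::filesystem::path("input.txt"));    // lex the contents of a file
    //     MyLexer b(std::string("x = 1"));                    // lex a copy of a string
    //     MyLexer c(Sawyer::Container::AllocatingBuffer<size_t, char>::instance("x = 1"));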

    /** Property: Name of stream. */
    const std::string& name() const {
        return name_;
    }

    /** Return the current token.
     *
     *  The current token will be an EOF token when all tokens are consumed. */
    const Token& current() {
        return (*this)[0];
    }

    /** Returns true if the stream is at the end.
     *
     *  This is equivalent to obtaining the current token and checking whether it's the EOF token. */
    bool atEof() {
        return current().isEof();
    }

    /** Return the current or future token.
     *
     *  The array operator obtains a token from a virtual array whose first element is the current token, second element is
     *  one past the current token, etc. The array is infinite in length, padded with EOF tokens. */
    const Token& operator[](size_t lookahead) {
        static const Token eof_;
        while (lookahead >= tokens_.size()) {
            if (!tokens_.empty() && tokens_.back().isEof())
                return eof_;
            tokens_.push_back(scanNextToken(content_, at_/*in,out*/));
        }
        return tokens_[lookahead];
    }

    /** Consume some tokens.
     *
     *  Consumes tokens by shifting @p n tokens off the low end of the virtual array of tokens. It is permissible to consume
     *  EOF tokens since the stream produces an unlimited supply of them once the end of input is reached. */
    void consume(size_t n = 1) {
        const Token &t = current();
        if (t.isEof()) {
            // void
        } else if (n >= tokens_.size()) {
            tokens_.clear();
        } else {
            tokens_.erase(tokens_.begin(), tokens_.begin() + n);
        }
    }
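
    // Example: a typical scan-and-consume loop. Illustrative sketch; TOK_COMMA is a
    // hypothetical token type and "process" is a hypothetical user function.
    //
    //     while (!lexer.atEof()) {
    //         if (lexer.isa(TOK_COMMA)) {
    //             lexer.consume();                     // skip separators
    //         } else {
    //             process(lexer.current());
    //             lexer.consume();
    //         }
    //     }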

    /** Return the lexeme for a token.
     *
     *  Consults the input stream to obtain the lexeme for the specified token and converts that part of the stream to a
     *  string, which is returned. The lexeme for an EOF token is an empty string, although other tokens might also have
     *  empty lexemes. One may query the lexeme for any token regardless of whether it's been consumed; in fact, one can even
     *  query lexemes for tokens that have never been seen by the token stream.
     *
     *  The no-argument version returns the lexeme of the current token.
     *
     *  If you're trying to build a fast lexical analyzer, don't call this function to compare a lexeme against some known
     *  string. Instead, use @ref match, which doesn't require copying.
     *
     * @{ */
    std::string lexeme(const Token &t) {
        if (const char *s = content_.characters(t.begin())) {
            return std::string(s, t.end() - t.begin());
        } else {
            return "";
        }
    }
    std::string lexeme() {
        return lexeme(current());
    }
    /** @} */
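
    // Example: obtaining lexemes as strings. Illustrative sketch.
    //
    //     std::string s1 = lexer.lexeme();             // lexeme of the current token
    //     std::string s2 = lexer.lexeme(lexer[2]);     // lexeme of a lookahead token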

    /** Determine whether a token is a specific type.
     *
     *  This is sometimes easier to call since it gracefully handles EOF tokens. If called with only one argument, the
     *  desired type, then it checks the current token.
     *
     * @{ */
    bool isa(const Token &t, typename Token::TokenEnum type) {
        return !t.isEof() && t.type() == type;
    }

    bool isa(typename Token::TokenEnum type) {
        return isa(current(), type);
    }
    /** @} */

    /** Determine whether a token matches a string.
     *
     *  Compares the specified string to a token's lexeme and returns true if they are the same. This is faster than
     *  obtaining the lexeme from a token and comparing it to a string since no string copying is involved.
     *
     *  The single-argument version compares the string with the current token's lexeme.
     *
     * @{ */
    bool match(const Token &t, const char *s) {
        ASSERT_not_null(s);
        size_t n1 = t.end() - t.begin();
        size_t n2 = strlen(s);
        if (n1 != n2)
            return false;
        if (0 == n1)
            return true;                                // both lexeme and string are empty
        const char *lexeme = content_.characters(t.begin());
        ASSERT_not_null(lexeme);
        return 0 == strncmp(lexeme, s, n1);
    }
    bool match(const char *s) {
        return match(current(), s);
    }
    /** @} */
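
    // Example: keyword tests without copying. Illustrative sketch.
    //
    //     if (lexer.match("if")) {                     // current token's lexeme is "if"?
    //         lexer.consume();
    //     } else if (lexer.match(lexer[1], "=")) {     // lookahead token's lexeme is "="?
    //         lexer.consume(2);
    //     }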

    /** Return the line number and offset for an input position.
     *
     *  Returns the zero-origin line number (a.k.a., line index) for the line containing the specified character position,
     *  and the offset of that character with respect to the beginning of the line. */
    std::pair<size_t, size_t> location(size_t position) {
        return content_.location(position);
    }

    /** Returns the last line index and character offset. */
    std::pair<size_t, size_t> locationEof() {
        size_t nChars = content_.nCharacters();
        return nChars > 0 ? content_.location(nChars-1) : content_.location(0);
    }

    /** Return the entire string for some line index. */
    std::string lineString(size_t lineIdx) {
        return content_.lineString(lineIdx);
    }

    /** Function that obtains the next token.
     *
     *  Subclasses implement this function to obtain the next token that starts at or after the specified input position.
     *  Upon return, the function should adjust @p at to point to the next position for scanning a token, which is usually
     *  the first character after the returned token's lexeme. If the scanner reaches the end of input, or any condition that
     *  it deems to be the end, then it should return the EOF token (a default-constructed token), after which this function
     *  will not be called again. */
    virtual Token scanNextToken(const Container::LineVector &content, size_t &at /*in,out*/) = 0;
};
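
// Example: a minimal concrete lexer. This is an illustrative sketch, not part of
// Sawyer; it recognizes unsigned integers and single-character symbols, skips white
// space (using isspace/isdigit from <cctype>), and assumes LineVector::character,
// which returns EOF past the end of input.
//
//     enum MyTokenType { TOK_NUMBER, TOK_SYMBOL };
//
//     typedef Sawyer::Lexer::Token<MyTokenType> MyToken;
//
//     class MyLexer: public Sawyer::Lexer::TokenStream<MyToken> {
//     public:
//         explicit MyLexer(const std::string &s)
//             : Sawyer::Lexer::TokenStream<MyToken>(s) {}
//
//         MyToken scanNextToken(const Sawyer::Container::LineVector &content, size_t &at /*in,out*/) {
//             while (isspace(content.character(at)))
//                 ++at;                                // skip white space between tokens
//             if (EOF == content.character(at))
//                 return MyToken();                    // end of input => EOF token
//             size_t begin = at;
//             if (isdigit(content.character(at))) {
//                 while (isdigit(content.character(at)))
//                     ++at;
//                 return MyToken(TOK_NUMBER, begin, at);
//             }
//             ++at;                                    // any other character is a one-char symbol
//             return MyToken(TOK_SYMBOL, begin, at);
//         }
//     };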

} // namespace Lexer
} // namespace Sawyer

#endif