ROSE  0.11.96.11
Lexer.h
1 // WARNING: Changes to this file must be contributed back to Sawyer or else they will
2 // be clobbered by the next update from Sawyer. The Sawyer repository is at
3 // https://github.com/matzke1/sawyer.
4 
5 
6 
7 
8 #ifndef Sawyer_Lexer_H
9 #define Sawyer_Lexer_H
10 
11 #include <Sawyer/AllocatingBuffer.h>
12 #include <Sawyer/LineVector.h>
13 #include <Sawyer/Optional.h>
14 #include <Sawyer/Sawyer.h>
15 
16 #include <boost/filesystem.hpp>
17 
18 namespace Sawyer {
19 
20 namespace Lexer {
21 
27 template<typename T>
28 class SAWYER_EXPORT Token {
29 public:
30  typedef T TokenEnum;
31 
32 private:
33  Optional<TokenEnum> type_; // empty means EOF
34  size_t begin_; // beginning character position in the input
35  size_t end_; // ending (exclusive) character position in the input
36 
37 public:
39  Token(): begin_(0), end_(0) {}
40 
47  Token(TokenEnum type, size_t begin, size_t end)
48  : type_(type), begin_(begin), end_(end) {
49  ASSERT_require(end >= begin);
50  }
51 
56  bool isEof() const {
57  if (type_)
58  return false;
59  return true;
60  }
61 
65  TokenEnum type() const {
66  return *type_;
67  }
68 
72  size_t begin() const {
73  return begin_;
74  }
75 
81  size_t end() const {
82  return end_;
83  }
84 };
85 
90 template<class T>
91 class SAWYER_EXPORT TokenStream {
92 public:
93  typedef T Token;
94 
95 private:
96  std::string name_; // name of stream (e.g., a file name)
97  Container::LineVector content_; // line-oriented character contents of the stream
98  size_t at_; // cursor position in stream
99  std::vector<Token> tokens_; // current token at [0] plus lookahead
100 
101 public:
102  virtual ~TokenStream() {}
103 
105  explicit TokenStream(const boost::filesystem::path &fileName)
106  : name_(fileName.string()), content_(fileName.string()), at_(0) {}
107 
112  explicit TokenStream(const std::string &inputString)
113  : name_("string"), content_(Container::AllocatingBuffer<size_t, char>::instance(inputString)), at_(0) {}
114 
119  : name_("string"), content_(buffer), at_(0) {}
120 
122  const std::string& name() const {
123  return name_;
124  }
125 
129  const Token& current() {
130  return (*this)[0];
131  }
132 
136  bool atEof() {
137  return current().isEof();
138  }
139 
144  const Token& operator[](size_t lookahead) {
145  static const Token eof_;
146  while (lookahead >= tokens_.size()) {
147  if (!tokens_.empty() && tokens_.back().isEof())
148  return eof_;
149  tokens_.push_back(scanNextToken(content_, at_/*in,out*/));
150  }
151  return tokens_[lookahead];
152  }
153 
158  void consume(size_t n = 1) {
159  const Token &t = current();
160  if (t.isEof()) {
161  // void
162  } else if (n >= tokens_.size()) {
163  tokens_.clear();
164  } else {
165  tokens_.erase(tokens_.begin(), tokens_.begin() + n);
166  }
167  }
168 
182  std::string lexeme(const Token &t) {
183  if (const char *s = content_.characters(t.begin())) {
184  return std::string(s, t.end() - t.begin());
185  } else {
186  return "";
187  }
188  }
189  std::string lexeme() {
190  return lexeme(current());
191  }
200  bool isa(const Token &t, typename Token::TokenEnum type) {
201  return !t.isEof() && t.type() == type;
202  }
203 
204  bool isa(typename Token::TokenEnum type) {
205  return isa(current(), type);
206  }
217  bool match(const Token &t, const char *s) {
218  ASSERT_not_null(s);
219  size_t n1 = t.end() - t.begin();
220  size_t n2 = strlen(s);
221  if (n1 != n2)
222  return false;
223  const char *lexeme = content_.characters(t.begin());
224  return 0 == strncmp(lexeme, s, n1);
225  }
226  bool match(const char *s) {
227  return match(current(), s);
228  }
235  std::pair<size_t, size_t> location(size_t position) {
236  return content_.location(position);
237  }
238 
240  std::pair<size_t, size_t> locationEof() {
241  size_t nChars = content_.nCharacters();
242  return nChars > 0 ? content_.location(nChars-1) : content_.location(0);
243  }
244 
246  std::string lineString(size_t lineIdx) {
247  return content_.lineString(lineIdx);
248  }
249 
257  virtual Token scanNextToken(const Container::LineVector &content, size_t &at /*in,out*/) = 0;
258 };
259 
260 } // namespace
261 } // namespace
262 
263 #endif
264 
Sawyer::Optional< TokenEnum >
Sawyer::Lexer::Token::isEof
bool isEof() const
Whether this is an EOF token.
Definition: Lexer.h:56
Sawyer::Lexer::Token
Represents one token of input.
Definition: Lexer.h:28
Sawyer::Container::LineVector::nCharacters
size_t nCharacters() const
Number of characters.
Definition: LineVector.h:74
Sawyer::Lexer::TokenStream::TokenStream
TokenStream(const Container::Buffer< size_t, char >::Ptr &buffer)
Create a token stream from a buffer.
Definition: Lexer.h:118
Sawyer::Lexer::Token::end
size_t end() const
Token lexeme ending position.
Definition: Lexer.h:81
Sawyer::Lexer::TokenStream::lexeme
std::string lexeme()
Return the lexeme for a token.
Definition: Lexer.h:189
Sawyer::Container::LineVector
A buffer of characters indexed by line number.
Definition: LineVector.h:24
Sawyer::Container::LineVector::characters
const char * characters(size_t charIdx) const
Characters at file offset.
Sawyer::Container::LineVector::lineString
std::string lineString(size_t lineIdx) const
Line as a string.
Sawyer::Lexer::TokenStream::atEof
bool atEof()
Returns true if the stream is at the end.
Definition: Lexer.h:136
Sawyer::Lexer::TokenStream::isa
bool isa(typename Token::TokenEnum type)
Determine whether token is a specific type.
Definition: Lexer.h:204
Sawyer::Lexer::TokenStream
An ordered list of tokens scanned from input.
Definition: Lexer.h:91
Sawyer::Lexer::TokenStream::consume
void consume(size_t n=1)
Consume some tokens.
Definition: Lexer.h:158
Sawyer::Lexer::Token::type
TokenEnum type() const
Returns the token's type.
Definition: Lexer.h:65
Sawyer::Lexer::TokenStream::lineString
std::string lineString(size_t lineIdx)
Return the entire string for some line index.
Definition: Lexer.h:246
Sawyer::Lexer::TokenStream::match
bool match(const char *s)
Determine whether a token matches a string.
Definition: Lexer.h:226
Sawyer::Lexer::TokenStream::operator[]
const Token & operator[](size_t lookahead)
Return the current or future token.
Definition: Lexer.h:144
Sawyer::Lexer::TokenStream::lexeme
std::string lexeme(const Token &t)
Return the lexeme for a token.
Definition: Lexer.h:182
Sawyer::Lexer::TokenStream::current
const Token & current()
Return the current token.
Definition: Lexer.h:129
Sawyer::SharedPointer
Reference-counting intrusive smart pointer.
Definition: SharedPointer.h:68
Sawyer
Name space for the entire library.
Definition: Access.h:13
Sawyer::Lexer::TokenStream::isa
bool isa(const Token &t, typename Token::TokenEnum type)
Determine whether token is a specific type.
Definition: Lexer.h:200
Sawyer::Lexer::TokenStream::name
const std::string & name() const
Property: Name of stream.
Definition: Lexer.h:122
Sawyer::Lexer::TokenStream::location
std::pair< size_t, size_t > location(size_t position)
Return the line number and offset for an input position.
Definition: Lexer.h:235
Sawyer::Lexer::Token::begin
size_t begin() const
Token lexeme starting position.
Definition: Lexer.h:72
Sawyer::Lexer::TokenStream::TokenStream
TokenStream(const std::string &inputString)
Create a token stream from a string.
Definition: Lexer.h:112
Sawyer::Lexer::TokenStream::TokenStream
TokenStream(const boost::filesystem::path &fileName)
Create a token stream from the contents of a file.
Definition: Lexer.h:105
Sawyer::Lexer::Token::Token
Token(TokenEnum type, size_t begin, size_t end)
Construct a token.
Definition: Lexer.h:47
Sawyer::Lexer::Token::Token
Token()
Construct an EOF token.
Definition: Lexer.h:39
Sawyer::Lexer::TokenStream::locationEof
std::pair< size_t, size_t > locationEof()
Returns the last line index and character offset.
Definition: Lexer.h:240
Sawyer::Container::LineVector::location
std::pair< size_t, size_t > location(size_t charIndex) const
Convert a character index to a line and column index.
Sawyer::Lexer::TokenStream::match
bool match(const Token &t, const char *s)
Determine whether a token matches a string.
Definition: Lexer.h:217