LCOV - code coverage report
Current view: top level - home/yyan7/compiler/rexompiler/src/util/Sawyer - Lexer.h (source / functions) Hit Total Coverage
Test: ROSE Lines: 0 51 0.0 %
Date: 2022-12-08 13:48:47 Functions: 0 10 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // WARNING: Changes to this file must be contributed back to Sawyer or else they will
       2             : //          be clobbered by the next update from Sawyer.  The Sawyer repository is at
       3             : //          https://github.com/matzke1/sawyer.
       4             : 
       5             : 
       6             : 
       7             : 
       8             : #ifndef Sawyer_Lexer_H
       9             : #define Sawyer_Lexer_H
      10             : 
      11             : #include <Sawyer/AllocatingBuffer.h>
      12             : #include <Sawyer/LineVector.h>
      13             : #include <Sawyer/Optional.h>
      14             : #include <Sawyer/Sawyer.h>
      15             : 
      16             : #include <boost/filesystem.hpp>
      17             : 
      18             : namespace Sawyer {
      19             : 
      20             : namespace Lexer {
      21             : 
      22             : /** Represents one token of input.
      23             :  *
      24             :  *  Each token has a user-defined type which is some enumerated constant, or no type at all which means the token represents
      25             :  *  the end of the input stream.  Tokens do not store their own lexeme, but rather point to the beginning and end of their
      26             :  *  lexeme in the input stream. */
      27             : template<typename T>
      28           0 : class SAWYER_EXPORT Token {
      29             : public:
      30             :     typedef T TokenEnum;
      31             : 
      32             : private:
      33             :     Optional<TokenEnum> type_;                          // empty means EOF
      34             :     size_t begin_;                                      // beginning character position in the input
      35             :     size_t end_;                                        // ending (exclusive) character position in the input
      36             : 
      37             : public:
      38             :     /** Construct an EOF token. */
      39           0 :     Token(): begin_(0), end_(0) {}
      40             : 
      41             :     /** Construct a token.
      42             :      *
      43             :      *  The token has the specified type and its lexeme consists of the characters at the specified position in the input stream. The
      44             :      *  @p begin points to the first character of the lexeme and @p end points to one past the last character.  A token's
      45             :      *  lexeme is allowed to be the empty string by specifying the same value for @p begin and @p end, however, @p end must
      46             :      *  never be less than @p begin. */
      47           0 :     Token(TokenEnum type, size_t begin, size_t end)
      48           0 :         : type_(type), begin_(begin), end_(end) {
      49           0 :         ASSERT_require(end >= begin);
      50           0 :     }
      51             : 
      52             :     /** Whether this is an EOF token.
      53             :      *
      54             :      *  An EOF token is a special token that has no type and an empty lexeme.  EOF tokens are constructed by the default
      55             :      *  constructor. */
      56           0 :     bool isEof() const {
      57           0 :         if (type_)
      58             :             return false;
      59             :         return true;
      60             :     }
      61             : 
      62             :     /** Returns the token type.
      63             :      *
      64             :      *  Since EOF tokens have no type, this must not be called for an EOF token. */
      65           0 :     TokenEnum type() const {
      66           0 :         return *type_;
      67             :     }
      68             : 
      69             :     /** Token lexeme starting position.
      70             :      *
      71             :      *  This is the starting offset in the input for the first character of this token's lexeme. */
      72           0 :     size_t begin() const {
      73             :         return begin_;
      74             :     }
      75             : 
      76             :     /** Token lexeme ending position.
      77             :      *
      78             :      *  This is the offset in the input for one position past the last character of this token's lexeme. It is guaranteed to be
      79             :      *  greater than or equal to the @ref begin position. EOF tokens will always have a @ref begin equal to the @ref end, but
      80             :      *  other empty non-EOF tokens are also possible. */
      81           0 :     size_t end() const {
      82             :         return end_;
      83             :     }
      84             : };
      85             : 
      86             : /** An ordered list of tokens scanned from input.
      87             :  *
      88             :  *  A token stream is an ordered list of tokens scanned from an unchanging input stream and consumed in the order they're
      89             :  *  produced. */
      90             : template<class T>
      91             : class SAWYER_EXPORT TokenStream {
      92             : public:
      93             :     typedef T Token;
      94             : 
      95             : private:
      96             :     std::string name_;                                  // name of stream (e.g., a file name)
      97             :     Container::LineVector content_;                     // line-oriented character contents of the stream
      98             :     size_t at_;                                         // cursor position in stream
      99             :     std::vector<Token> tokens_;                         // current token at [0] plus lookahead
     100             : 
     101             : public:
     102           0 :     virtual ~TokenStream() {}
     103             : 
     104             :     /** Create a token stream from the contents of a file. */
     105             :     explicit TokenStream(const boost::filesystem::path &fileName)
     106             :         : name_(fileName.string()), content_(fileName.string()), at_(0) {}
     107             : 
     108             :     /** Create a token stream from a string.
     109             :      *
     110             :      *  The string content is copied into the lexer and thus can be modified after the lexer returns without affecting the
     111             :      *  token stream. */
     112           0 :     explicit TokenStream(const std::string &inputString)
     113           0 :         : name_("string"), content_(Container::AllocatingBuffer<size_t, char>::instance(inputString)), at_(0) {}
     114             : 
     115             :     /** Create a token stream from a buffer.
     116             :      *
     117             :      *  The token stream uses the specified buffer, which should not be modified while the token stream is alive. */
     118             :     explicit TokenStream(const Container::Buffer<size_t, char>::Ptr &buffer)
     119             :         : name_("string"), content_(buffer), at_(0) {}
     120             : 
     121             :     /** Property: Name of stream. */
     122           0 :     const std::string& name() const {
     123           0 :         return name_;
     124             :     }
     125             : 
     126             :     /** Return the current token.
     127             :      *
     128             :      *  The current token will be an EOF token when all tokens are consumed. */
     129           0 :     const Token& current() {
     130           0 :         return (*this)[0];
     131             :     }
     132             : 
     133             :     /** Returns true if the stream is at the end.
     134             :      *
     135             :      *  This is equivalent to obtaining the current token and checking whether it's the EOF token. */
     136           0 :     bool atEof() {
     137           0 :         return current().isEof();
     138             :     }
     139             : 
     140             :     /** Return the current or future token.
     141             :      *
     142             :      *  The array operator obtains a token from a virtual array whose first element is the current token, second element is one
     143             :      *  past the current token, etc.  The array is infinite in length, padded with EOF tokens. */
     144           0 :     const Token& operator[](size_t lookahead) {
     145           0 :         static const Token eof_;
     146           0 :         while (lookahead >= tokens_.size()) {
     147           0 :             if (!tokens_.empty() && tokens_.back().isEof())
     148             :                 return eof_;
     149           0 :             tokens_.push_back(scanNextToken(content_, at_/*in,out*/));
     150             :         }
     151           0 :         return tokens_[lookahead];
     152             :     }
     153             : 
     154             :     /** Consume some tokens.
     155             :      *
     156             :      *  Consumes tokens by shifting @p n tokens off the low-end of the virtual array of tokens. It is permissible to consume
     157             :      *  EOF tokens since more will be generated once the end-of-input is reached. */
     158           0 :     void consume(size_t n = 1) {
     159           0 :         const Token &t = current();
     160           0 :         if (t.isEof()) {
     161             :             // void
     162           0 :         } else if (n >= tokens_.size()) {
     163           0 :             tokens_.clear();
     164             :         } else {
     165           0 :             tokens_.erase(tokens_.begin(), tokens_.begin() + n);
     166             :         }
     167           0 :     }
     168             : 
     169             :     /** Return the lexeme for a token.
     170             :      *
     171             :      *  Consults the input stream to obtain the lexeme for the specified token and converts that part of the stream to a string
     172             :      *  which is returned.  The lexeme for an EOF token is an empty string, although other tokens might also have empty
     173             :      *  lexemes.  One may query the lexeme for any token regardless of whether it's been consumed; in fact, one can even query
     174             :      *  lexemes for tokens that have never even been seen by the token stream.
     175             :      *
     176             :      *  The no-argument version returns the lexeme of the current token.
     177             :      *
     178             :      *  If you're trying to build a fast lexical analyzer, don't call this function to compare a lexeme against some known
     179             :      *  string. Instead, use @ref match, which doesn't require copying.
     180             :      *
     181             :      *  @{ */
     182           0 :     std::string lexeme(const Token &t) {
     183           0 :         if (const char *s = content_.characters(t.begin())) {
     184           0 :             return std::string(s, t.end() - t.begin());
     185             :         } else {
     186           0 :             return "";
     187             :         }
     188             :     }
     189           0 :     std::string lexeme() {
     190           0 :         return lexeme(current());
     191             :     }
     192             :     /** @} */
     193             : 
     194             :     /** Determine whether token is a specific type.
     195             :      *
     196             :      *  This is sometimes easier to call since it gracefully handles EOF tokens.  If called with only one argument, the desired
     197             :      *  type, then it checks the current token.
     198             :      *
     199             :      * @{ */
     200           0 :     bool isa(const Token &t, typename Token::TokenEnum type) {
     201           0 :         return !t.isEof() && t.type() == type;
     202             :     }
     203             : 
     204           0 :     bool isa(typename Token::TokenEnum type) {
     205           0 :         return isa(current(), type);
     206             :     }
     207             :     /** @} */
     208             : 
     209             :     /** Determine whether a token matches a string.
     210             :      *
     211             :      *  Compares the specified string to a token's lexeme and returns true if they are the same. This is faster than obtaining
     212             :      *  the lexeme from a token and comparing to a string since there's no string copying involved with this function.
     213             :      *
     214             :      *  The single-argument version compares the string with the current token's lexeme.
     215             :      *
     216             :      * @{ */
     217             :     bool match(const Token &t, const char *s) {
     218             :         ASSERT_not_null(s);
     219             :         size_t n1 = t.end() - t.begin();
     220             :         size_t n2 = strlen(s);
     221             :         if (n1 != n2)
     222             :             return false;
     223             :         const char *lexeme = content_.characters(t.begin());
     224             :         return 0 == strncmp(lexeme, s, n1);
     225             :     }
     226             :     bool match(const char *s) {
     227             :         return match(current(), s);
     228             :     }
     229             :     /** @} */
     230             :     
     231             :     /** Return the line number and offset for an input position.
     232             :      *
     233             :      *  Returns the zero-origin line number (a.k.a., line index) for the line containing the specified character position, and
     234             :      *  the offset of that character with respect to the beginning of the line. */
     235           0 :     std::pair<size_t, size_t> location(size_t position) {
     236           0 :         return content_.location(position);
     237             :     }
     238             : 
     239             :     /** Returns the line index and character offset of the last character of input, or of position zero if the input is empty. */
     240           0 :     std::pair<size_t, size_t> locationEof() {
     241           0 :         size_t nChars = content_.nCharacters();
     242           0 :         return nChars > 0 ? content_.location(nChars-1) : content_.location(0);
     243             :     }
     244             :     
     245             :     /** Return the entire string for some line index. */
     246           0 :     std::string lineString(size_t lineIdx) {
     247           0 :         return content_.lineString(lineIdx);
     248             :     }
     249             : 
     250             :     /** Function that obtains the next token.
     251             :      *
     252             :      *  Subclasses implement this function to obtain the next token that starts at or after the specified input position. Upon
     253             :      *  return, the function should adjust @p at to point to the next position for scanning a token, which is usually the first
     254             :      *  character after the returned token's lexeme. If the scanner reaches the end of input or any condition that it deems to
     255             :      *  be the end then it should return the EOF token (a default-constructed token), after which this function will not be
     256             :      *  called again. */
     257             :     virtual Token scanNextToken(const Container::LineVector &content, size_t &at /*in,out*/) = 0;
     258             : };
     259             : 
     260             : } // namespace
     261             : } // namespace
     262             : 
     263             : #endif
     264             : 

Generated by: LCOV version 1.14