Merged
158 changes: 158 additions & 0 deletions lldb/include/lldb/ValueObject/DILLexer.h
@@ -0,0 +1,158 @@
//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
#define LLDB_VALUEOBJECT_DILLEXER_H_
Collaborator:

I believe our headers don't have the ending underscore.

Suggested change
#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
#define LLDB_VALUEOBJECT_DILLEXER_H_
#ifndef LLDB_VALUEOBJECT_DILLEXER_H
#define LLDB_VALUEOBJECT_DILLEXER_H


#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Error.h"
#include <cstdint>
#include <limits.h>
Collaborator:

It fits in nicer with the other headers (though maybe you don't need it if you remove the UINT_MAX thing below)

Suggested change
#include <limits.h>
#include <climits>

#include <memory>
#include <string>
#include <vector>

namespace lldb_private {

namespace dil {
Collaborator:

Suggested change
namespace lldb_private {
namespace dil {
namespace lldb_private::dil {


/// Class defining the tokens generated by the DIL lexer and used by the
/// DIL parser.
class Token {
public:
enum Kind {
coloncolon,
eof,
identifier,
invalid,
kw_namespace,
l_paren,
none,
r_paren,
unknown,
Collaborator:

It looks like the only remaining use of the "unknown" token is to construct "empty" tokens in the tests. I think all of those cases could be avoided by just declaring the Token variable later in the code.

Contributor Author:

That works for this PR, but the parser has a token member variable, where it stores the current token it's working on. When I create the parser I have to initialize the member variable token to some value; the only one that makes sense is "unknown".

Collaborator:

Maybe that variable should be optional<Token> then? Though I'm not sure why it's needed, as things could just call lexer.GetCurrentToken() instead. In either case, I'd like to remove this from this patch as it's not needed here. If it turns out to be the best solution to the parser's needs, we can add the extra type then.

Contributor Author:

I've just thought of an alternative implementation I think I can use that will make this work. :-) I'll test that and if it works I'll get rid of the 'unknown' token type.

Collaborator:

The unknown token is still here. Are you sure you uploaded the right version of the patch?

};

Token(Kind kind, std::string spelling, uint32_t start)
: m_kind(kind), m_spelling(spelling), m_start_pos(start) {}
Collaborator:

Suggested change
: m_kind(kind), m_spelling(spelling), m_start_pos(start) {}
: m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {}


Token() : m_kind(Kind::none), m_spelling(""), m_start_pos(0) {}

void SetKind(Kind kind) { m_kind = kind; }
Collaborator:

Could we get rid of these (and of the none token kind)? Ideally, I'd like to treat the token as a value type (so you can assign and copy it, but not mess with it), and one that is always valid (no uninitialized (none) state).

Contributor Author (@cmtice, Jan 28, 2025):

I can get rid of the constructor on line 44, and I can get rid of the none token, but I need to keep SetKind, in order to separate ">>" into two ">"s, in the case where we have determined that ">>" is actually two single angle brackets closing a template, such as std::vector<std::pair<int,int>>. In that case, I'll also need to insert a token into the middle of the tokens vector -- not great, but may not happen that often, and the vector shouldn't be that long.

Collaborator:

That doesn't remove your ability to modify a token, just how you do it (you update the token as a whole, instead of doing it piecewise). So, instead of something like token.SetKind(right_angle); token.SetSpelling(">"); you'd do token = Token(right_angle, ">", token.GetPosition()).

That said, modification of the token stream and multi-pass (tentative) parsing doesn't sound like an ideal combination. How do you ensure that the modification does not leak into subsequent actions if the tentative parse is aborted?

Even if you can, the fact that you have to consider this makes me uneasy. Would it be possible to do this without modifying the token stream? So like, when the parser (I'm assuming this happens in the parser, as you need more context to disambiguate these) encounters a ">>" token and the current context allows you to treat this as a double closing template bracket, you act as if you encountered two ">" tokens, but without actually updating the tokens in the lexer.


Kind GetKind() const { return m_kind; }

std::string GetSpelling() const { return m_spelling; }

uint32_t GetLength() const { return m_spelling.size(); }

bool Is(Kind kind) const { return m_kind == kind; }

bool IsNot(Kind kind) const { return m_kind != kind; }

bool IsOneOf(Kind kind1, Kind kind2) const { return Is(kind1) || Is(kind2); }

template <typename... Ts> bool IsOneOf(Kind kind, Ts... Ks) const {
return Is(kind) || IsOneOf(Ks...);
}

uint32_t GetLocation() const { return m_start_pos; }

static llvm::StringRef GetTokenName(Kind kind);

private:
Kind m_kind;
std::string m_spelling;
uint32_t m_start_pos; // within entire expression string
};

/// Class for doing the simple lexing required by DIL.
class DILLexer {
public:
DILLexer(llvm::StringRef dil_expr) : m_expr(dil_expr) {
m_cur_pos = m_expr.begin();
// Use UINT_MAX to indicate invalid/uninitialized value.
m_tokens_idx = UINT_MAX;
m_invalid_token = Token(Token::invalid, "", 0);
}

llvm::Expected<bool> LexAll();

/// Return the lexed token N+1 positions ahead of the 'current' token
Collaborator:

I feel like there are too many ways to navigate the token stream here. You can either call GetCurrentToken+IncrementTokenIdx, or GetNextToken (which I guess increments the index automatically), or LookAhead+AcceptLookAhead.

I think it would be better to start with something simple (we can add more or revamp the existing API if it turns out to be clunky). What would you say to something like:

const Token &LookAhead(uint32_t N /* add `=1` if you want*/);
const Token &GetCurrentToken() { return LookAhead(0); } // just a fancy name for a look ahead of zero
void Advance(uint32_t N = 1); // advance the token stream

Contributor Author:

The parser really needs a way to save & restore/reset the token index, because there are places in the parser where it does tentative parsing & then decides to rollback. It does so by saving the current token index, then doing the tentative parsing (which can advance the index some number of tokens), and then (for rolling back) setting the current token index back to the saved value.

So I don't think the simple API you've outlined above would be sufficient.

Collaborator:

I'm fine with tentative parse and roll back APIs. I'm commenting on the other APIs which advance through the token stream linearly (but in a very baroque fashion). IOW, my proposal was to replace GetCurrentToken, IncrementTokenIdx, GetNextToken, LookAhead and AcceptLookAhead with the three functions above (exact names TBD), and keep GetCurrentTokenIdx and ResetTokenIdx as they are.

/// being handled by the DIL parser.
const Token &LookAhead(uint32_t N);

const Token &AcceptLookAhead(uint32_t N);

const Token &GetNextToken();

/// Return the index for the 'current' token being handled by the DIL parser.
uint32_t GetCurrentTokenIdx() { return m_tokens_idx; }

/// Return the current token to be handled by the DIL parser.
const Token &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; }

uint32_t NumLexedTokens() { return m_lexed_tokens.size(); }

/// Update the index for the 'current' token, to point to the next lexed
/// token.
bool IncrementTokenIdx() {
Member:

Shouldn't Lex() do this automatically?

Contributor Author:

Not really. Lex() is called from LookAhead, when we definitely do not want to automatically increment the token index.

if (m_tokens_idx >= m_lexed_tokens.size() - 1)
return false;

m_tokens_idx++;
return true;
}

/// Set the index for the 'current' token (to be handled by the parser)
/// to a particular position. Used for either committing 'look ahead' parsing
/// or rolling back tentative parsing.
bool ResetTokenIdx(uint32_t new_value) {
if (new_value > m_lexed_tokens.size() - 1)
return false;

m_tokens_idx = new_value;
return true;
}
Collaborator:

Suggested change
bool ResetTokenIdx(uint32_t new_value) {
if (new_value > m_lexed_tokens.size() - 1)
return false;
m_tokens_idx = new_value;
return true;
}
void ResetTokenIdx(uint32_t new_value) {
assert(new_value < m_lexed_tokens.size());
m_tokens_idx = new_value;
}

(AIUI, the only usage of this function will be to restore a previous (and valid) position, so any error here is definitely a bug)


uint32_t GetLocation() { return m_cur_pos - m_expr.begin(); }
Collaborator:

should this be private?


private:
llvm::Expected<Token> Lex();

llvm::iterator_range<llvm::StringRef::iterator> IsWord();

/// Update 'result' with the other parameter values, create a
/// duplicate token, and push the duplicate token onto the vector of
/// lexed tokens.
void UpdateLexedTokens(Token &result, Token::Kind tok_kind,
std::string tok_str, uint32_t tok_pos);

// The input string we are lexing & parsing.
llvm::StringRef m_expr;

// The current position of the lexer within m_expr (the character position,
// within the string, of the next item to be lexed).
llvm::StringRef::iterator m_cur_pos;

// Holds all of the tokens lexed so far.
std::vector<Token> m_lexed_tokens;

// Index into m_lexed_tokens; indicates which token the DIL parser is
// currently trying to parse/handle.
uint32_t m_tokens_idx;

// "invalid" token; to be returned by lexer when 'look ahead' fails.
Token m_invalid_token;
};

} // namespace dil

} // namespace lldb_private

#endif // LLDB_VALUEOBJECT_DILLEXER_H_
1 change: 1 addition & 0 deletions lldb/source/ValueObject/CMakeLists.txt
@@ -1,4 +1,5 @@
add_lldb_library(lldbValueObject
DILLexer.cpp
ValueObject.cpp
ValueObjectCast.cpp
ValueObjectChild.cpp
Expand Down
189 changes: 189 additions & 0 deletions lldb/source/ValueObject/DILLexer.cpp
@@ -0,0 +1,189 @@
//===-- DILLexer.cpp ------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// This implements the lexer for the Data Inspection Language (DIL), and
// its helper functions, which will eventually underlie the 'frame variable'
// command. The language that this lexer recognizes is described in
// lldb/docs/dil-expr-lang.ebnf
//
//===----------------------------------------------------------------------===//

#include "lldb/ValueObject/DILLexer.h"
#include "lldb/Utility/Status.h"
#include "llvm/ADT/StringSwitch.h"

namespace lldb_private {

namespace dil {

llvm::StringRef Token::GetTokenName(Kind kind) {
switch (kind) {
case Kind::coloncolon:
return "coloncolon";
case Kind::eof:
return "eof";
case Kind::identifier:
return "identifier";
case Kind::invalid:
return "invalid";
case Kind::kw_namespace:
return "namespace";
case Kind::l_paren:
return "l_paren";
case Kind::none:
return "none";
case Kind::r_paren:
return "r_paren";
case Kind::unknown:
return "unknown";
}
}

static bool IsLetter(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
}

static bool IsDigit(char c) { return '0' <= c && c <= '9'; }

// A word starts with a letter, underscore, or dollar sign, followed by
// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or underscores.
llvm::iterator_range<llvm::StringRef::iterator> DILLexer::IsWord() {
llvm::StringRef::iterator start = m_cur_pos;
bool dollar_start = false;

// Must not start with a digit.
if (m_cur_pos == m_expr.end() || IsDigit(*m_cur_pos))
return llvm::make_range(m_cur_pos, m_cur_pos);

// First character *may* be a '$', for a register name or convenience
// variable.
if (*m_cur_pos == '$') {
dollar_start = true;
++m_cur_pos;
}

// Contains only letters, digits or underscores
for (; m_cur_pos != m_expr.end(); ++m_cur_pos) {
char c = *m_cur_pos;
if (!IsLetter(c) && !IsDigit(c) && c != '_')
break;
}

// If first char is '$', make sure there's at least one more char, or it's
// invalid.
if (dollar_start && (m_cur_pos - start <= 1)) {
m_cur_pos = start;
return llvm::make_range(start, start); // Empty range
}

return llvm::make_range(start, m_cur_pos);
}

void DILLexer::UpdateLexedTokens(Token &result, Token::Kind tok_kind,
std::string tok_str, uint32_t tok_pos) {
Token new_token(tok_kind, tok_str, tok_pos);
result = new_token;
m_lexed_tokens.push_back(std::move(new_token));
}

llvm::Expected<bool> DILLexer::LexAll() {
bool done = false;
while (!done) {
auto tok_or_err = Lex();
if (!tok_or_err)
return tok_or_err.takeError();
Token token = *tok_or_err;
if (token.GetKind() == Token::eof) {
done = true;
}
}
return true;
}

llvm::Expected<Token> DILLexer::Lex() {
Token result;

// Skip over whitespace (spaces).
while (m_cur_pos != m_expr.end() && *m_cur_pos == ' ')
m_cur_pos++;

// Check to see if we've reached the end of our input string.
if (m_cur_pos == m_expr.end()) {
UpdateLexedTokens(result, Token::eof, "", (uint32_t)m_expr.size());
return result;
}

uint32_t position = m_cur_pos - m_expr.begin();
llvm::StringRef::iterator start = m_cur_pos;
llvm::iterator_range<llvm::StringRef::iterator> word_range = IsWord();
if (!word_range.empty()) {
uint32_t length = word_range.end() - word_range.begin();
llvm::StringRef word(m_expr.substr(position, length));
// We will be adding more keywords here in the future...
Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
.Case("namespace", Token::kw_namespace)
.Default(Token::identifier);
UpdateLexedTokens(result, kind, word.str(), position);
return result;
}

m_cur_pos = start;
llvm::StringRef remainder(m_expr.substr(position, m_expr.end() - m_cur_pos));
std::vector<std::pair<Token::Kind, const char *>> operators = {
{Token::l_paren, "("},
{Token::r_paren, ")"},
{Token::coloncolon, "::"},
};
for (auto [kind, str] : operators) {
if (remainder.consume_front(str)) {
m_cur_pos += strlen(str);
UpdateLexedTokens(result, kind, str, position);
return result;
}
}

// Unrecognized character(s) in string; unable to lex it.
Status error("Unable to lex input string");
return error.ToError();
}
Collaborator:

Sorry for rewriting this for you, but I figured it's easier than explaining everything in the abstract:

The main things I wanted to achieve by this are:

  • no half-initialized state (object constructed, but LexAll not called). The object is always constructed fully parsed. It's basically what's described here, but even better because there isn't even a privately-visible half-initialized state. (Since the only state of the lexer is basically "the remainder of the string", I figured it's easier to pass it as arguments and construct the lexer only when it's done. This also lets us get rid of the `m_cur_pos` member, which is only used in the initialization stage.)
  • I doubled down on the StringRef representation. I see you've partially used it, but that still meant that there were some awkward conversions between position-in-the-string and StringRef representations. Now they're gone. I also realized that iterator_range<StringRef::iterator> is just an (unnecessarily) fancy name for StringRef, so I just use that throughout.
  • no more UpdateLexedTokens. Just using Token as a value type. The overall programming style is also more functional - fewer side effects, more return values.

The thing I did not do (but I still think would be better) is to replace the std::vector<std::pair<>> keyword representation with the "constexpr array of pairs" I had in my original suggestion. I think that's better because the vector means you'll be constructing a new vector object every time you call this function. That's going to impact performance more (although it will still probably be unnoticeable) than any StringSwitch usage, as it causes a memory allocation. If you think the use of a C array is obsolete, you can also use a constexpr std::initializer_list<std::pair<>>, but I find that just adds an unnecessary level of boilerplate.

Suggested change
llvm::Expected<bool> DILLexer::LexAll() {
bool done = false;
while (!done) {
auto tok_or_err = Lex();
if (!tok_or_err)
return tok_or_err.takeError();
Token token = *tok_or_err;
if (token.GetKind() == Token::eof) {
done = true;
}
}
return true;
}
llvm::Expected<Token> DILLexer::Lex() {
Token result;
// Skip over whitespace (spaces).
while (m_cur_pos != m_expr.end() && *m_cur_pos == ' ')
m_cur_pos++;
// Check to see if we've reached the end of our input string.
if (m_cur_pos == m_expr.end()) {
UpdateLexedTokens(result, Token::eof, "", (uint32_t)m_expr.size());
return result;
}
uint32_t position = m_cur_pos - m_expr.begin();
llvm::StringRef::iterator start = m_cur_pos;
llvm::iterator_range<llvm::StringRef::iterator> word_range = IsWord();
if (!word_range.empty()) {
uint32_t length = word_range.end() - word_range.begin();
llvm::StringRef word(m_expr.substr(position, length));
// We will be adding more keywords here in the future...
Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
.Case("namespace", Token::kw_namespace)
.Default(Token::identifier);
UpdateLexedTokens(result, kind, word.str(), position);
return result;
}
m_cur_pos = start;
llvm::StringRef remainder(m_expr.substr(position, m_expr.end() - m_cur_pos));
std::vector<std::pair<Token::Kind, const char *>> operators = {
{Token::l_paren, "("},
{Token::r_paren, ")"},
{Token::coloncolon, "::"},
};
for (auto [kind, str] : operators) {
if (remainder.consume_front(str)) {
m_cur_pos += strlen(str);
UpdateLexedTokens(result, kind, str, position);
return result;
}
}
// Unrecognized character(s) in string; unable to lex it.
Status error("Unable to lex input string");
return error.ToError();
}
llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
std::vector<Token> tokens;
llvm::StringRef remainder = expr;
do {
if (llvm::Expected<Token> t = Lex(expr, remainder))
tokens.push_back(std::move(*t));
else
return t.takeError();
} while (tokens.back().GetKind() != Token::eof);
return DILLexer(std::move(tokens)); // calling a private constructor
}
static llvm::Expected<Token> Lex(llvm::StringRef expr, llvm::StringRef &remainder) {
// Skip over whitespace.
remainder = remainder.ltrim();
size_t position = remainder.data()-expr.data();
// Check to see if we've reached the end of our input string.
if (remainder.empty())
return Token(Token::eof, "", position);
llvm::StringRef word = IsWord(remainder); // automatically updates `remainder`, you may be able to use things like `StringRef::drop_while` in the implementation
if (!word.empty()) {
// We will be adding more keywords here in the future...
Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
.Case("namespace", Token::kw_namespace)
.Default(Token::identifier);
return Token(kind, word.str(), position);
}
std::vector<std::pair<Token::Kind, const char *>> operators = {
{Token::l_paren, "("},
{Token::r_paren, ")"},
{Token::coloncolon, "::"},
};
for (auto [kind, str] : operators) {
if (remainder.consume_front(str))
return Token(kind, str, position);
}
return llvm::createStringError("Unable to lex input string");
}


const Token &DILLexer::LookAhead(uint32_t N) {
if (m_tokens_idx + N + 1 < m_lexed_tokens.size())
Collaborator:

You didn't say anything about what you make of my suggestion to use an "infinite" stream of eof tokens at the end. The current implementation uses an infinite stream of invalid tokens, but I doubt that anything cares whether it's working with an invalid or eof token (if you're looking ahead, chances are you know what you're expecting to find, and you just want to know whether your token matches that expectation).

That would allow us to get rid of another pseudo-token and the m_invalid_token member.

Contributor Author:

I think the "infinite" stream of eof tokens at the end is ok; I will give it a try. :-)

return m_lexed_tokens[m_tokens_idx + N + 1];

return m_invalid_token;
}

const Token &DILLexer::AcceptLookAhead(uint32_t N) {
if (m_tokens_idx + N + 1 > m_lexed_tokens.size())
return m_invalid_token;

m_tokens_idx += N + 1;
return m_lexed_tokens[m_tokens_idx];
}

const Token &DILLexer::GetNextToken() {
if (m_tokens_idx == UINT_MAX)
m_tokens_idx = 0;
else
m_tokens_idx++;

// Return the next token in the vector of lexed tokens.
if (m_tokens_idx < m_lexed_tokens.size())
return m_lexed_tokens[m_tokens_idx];

// We're already at/beyond the end of our lexed tokens. If the last token
// is an eof token, return it.
if (m_lexed_tokens[m_lexed_tokens.size() - 1].GetKind() == Token::eof)
return m_lexed_tokens[m_lexed_tokens.size() - 1];

// Return the invalid token.
return m_invalid_token;
}

} // namespace dil

} // namespace lldb_private
1 change: 1 addition & 0 deletions lldb/unittests/ValueObject/CMakeLists.txt
@@ -1,5 +1,6 @@
add_lldb_unittest(LLDBValueObjectTests
DumpValueObjectOptionsTests.cpp
DILLexerTests.cpp

LINK_LIBS
lldbValueObject
Expand Down