[LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). #123521

cmtice · 2025-01-19T17:21:45Z

This adds the basic lexer, with unittests, for the Data Inspection Language (DIL) -- see
https://discourse.llvm.org/t/rfc-data-inspection-language/69893

This version of the lexer only handles local variables and namespaces, and is designed to work with
#120971.

This adds the basic lexer, with unittests, for the Data Inspection Language (DIL) -- see https://discourse.llvm.org/t/rfc-data-inspection-language/69893 This version of the lexer only handles local variables and namespaces, and is designed to work with llvm#120971.

llvmbot · 2025-01-19T17:22:19Z

@llvm/pr-subscribers-lldb

Author: None (cmtice)

Changes

This adds the basic lexer, with unittests, for the Data Inspection Language (DIL) -- see
https://discourse.llvm.org/t/rfc-data-inspection-language/69893

This version of the lexer only handles local variables and namespaces, and is designed to work with
#120971.

Full diff: https://github.com/llvm/llvm-project/pull/123521.diff

4 Files Affected:

(added) lldb/include/lldb/ValueObject/DILLexer.h (+156)
(added) lldb/source/ValueObject/DILLexer.cpp (+205)
(modified) lldb/unittests/ValueObject/CMakeLists.txt (+1)
(added) lldb/unittests/ValueObject/DILLexerTests.cpp (+193)

diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h
new file mode 100644
index 00000000000000..45c506b2f4106d
--- /dev/null
+++ b/lldb/include/lldb/ValueObject/DILLexer.h
@@ -0,0 +1,156 @@
+//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
+#define LLDB_VALUEOBJECT_DILLEXER_H_
+
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+#include <limits.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace lldb_private {
+
+namespace dil {
+
+enum class TokenKind {
+  coloncolon,
+  eof,
+  identifier,
+  invalid,
+  kw_namespace,
+  l_paren,
+  none,
+  r_paren,
+  unknown,
+};
+
+/// Class defining the tokens generated by the DIL lexer and used by the
+/// DIL parser.
+class DILToken {
+public:
+  DILToken(dil::TokenKind kind, std::string spelling, uint32_t start)
+      : m_kind(kind), m_spelling(spelling), m_start_pos(start) {}
+
+  DILToken() : m_kind(dil::TokenKind::none), m_spelling(""), m_start_pos(0) {}
+
+  void setKind(dil::TokenKind kind) { m_kind = kind; }
+  dil::TokenKind getKind() const { return m_kind; }
+
+  std::string getSpelling() const { return m_spelling; }
+
+  uint32_t getLength() const { return m_spelling.size(); }
+
+  bool is(dil::TokenKind kind) const { return m_kind == kind; }
+
+  bool isNot(dil::TokenKind kind) const { return m_kind != kind; }
+
+  bool isOneOf(dil::TokenKind kind1, dil::TokenKind kind2) const {
+    return is(kind1) || is(kind2);
+  }
+
+  template <typename... Ts> bool isOneOf(dil::TokenKind kind, Ts... Ks) const {
+    return is(kind) || isOneOf(Ks...);
+  }
+
+  uint32_t getLocation() const { return m_start_pos; }
+
+  void setValues(dil::TokenKind kind, std::string spelling, uint32_t start) {
+    m_kind = kind;
+    m_spelling = spelling;
+    m_start_pos = start;
+  }
+
+  static const std::string getTokenName(dil::TokenKind kind);
+
+private:
+  dil::TokenKind m_kind;
+  std::string m_spelling;
+  uint32_t m_start_pos; // within entire expression string
+};
+
+/// Class for doing the simple lexing required by DIL.
+class DILLexer {
+public:
+  DILLexer(llvm::StringRef dil_expr) : m_expr(dil_expr.str()) {
+    m_cur_pos = m_expr.begin();
+    // Use UINT_MAX to indicate invalid/uninitialized value.
+    m_tokens_idx = UINT_MAX;
+  }
+
+  bool Lex(DILToken &result, bool look_ahead = false);
+
+  bool Is_Word(std::string::iterator start, uint32_t &length);
+
+  uint32_t GetLocation() { return m_cur_pos - m_expr.begin(); }
+
+  /// Update 'result' with the other paremeter values, create a
+  /// duplicate token, and push the duplicate token onto the vector of
+  /// lexed tokens.
+  void UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,
+                         std::string tok_str, uint32_t tok_pos);
+
+  /// Return the lexed token N+1 positions ahead of the 'current' token
+  /// being handled by the DIL parser.
+  const DILToken &LookAhead(uint32_t N);
+
+  const DILToken &AcceptLookAhead(uint32_t N);
+
+  /// Return the index for the 'current' token being handled by the DIL parser.
+  uint32_t GetCurrentTokenIdx() { return m_tokens_idx; }
+
+  /// Return the current token to be handled by the DIL parser.
+  DILToken &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; }
+
+  /// Update the index for the 'current' token, to point to the next lexed
+  /// token.
+  bool IncrementTokenIdx() {
+    if (m_tokens_idx >= m_lexed_tokens.size() - 1)
+      return false;
+
+    m_tokens_idx++;
+    return true;
+  }
+
+  /// Set the index for the 'current' token (to be handled by the parser)
+  /// to a particular position. Used for either committing 'look ahead' parsing
+  /// or rolling back tentative parsing.
+  bool ResetTokenIdx(uint32_t new_value) {
+    if (new_value > m_lexed_tokens.size() - 1)
+      return false;
+
+    m_tokens_idx = new_value;
+    return true;
+  }
+
+private:
+  // The input string we are lexing & parsing.
+  std::string m_expr;
+
+  // The current position of the lexer within m_expr (the character position,
+  // within the string, of the next item to be lexed).
+  std::string::iterator m_cur_pos;
+
+  // Holds all of the tokens lexed so far.
+  std::vector<DILToken> m_lexed_tokens;
+
+  // Index into m_lexed_tokens; indicates which token the DIL parser is
+  // currently trying to parse/handle.
+  uint32_t m_tokens_idx;
+
+  // "invalid" token; to be returned by lexer when 'look ahead' fails.
+  DILToken m_invalid_token;
+};
+
+} // namespace dil
+
+} // namespace lldb_private
+
+#endif // LLDB_VALUEOBJECT_DILLEXER_H_
diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
new file mode 100644
index 00000000000000..4c2b0b1813bb96
--- /dev/null
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -0,0 +1,205 @@
+//===-- DILLexer.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This implements the recursive descent parser for the Data Inspection
+// Language (DIL), and its helper functions, which will eventually underlie the
+// 'frame variable' command. The language that this parser recognizes is
+// described in lldb/docs/dil-expr-lang.ebnf
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "llvm/ADT/StringMap.h"
+
+namespace lldb_private {
+
+namespace dil {
+
+// For fast keyword lookup. More keywords will be added later.
+const llvm::StringMap<dil::TokenKind> Keywords = {
+    {"namespace", dil::TokenKind::kw_namespace},
+};
+
+const std::string DILToken::getTokenName(dil::TokenKind kind) {
+  switch (kind) {
+  case dil::TokenKind::coloncolon:
+    return "coloncolon";
+  case dil::TokenKind::eof:
+    return "eof";
+  case dil::TokenKind::identifier:
+    return "identifier";
+  case dil::TokenKind::kw_namespace:
+    return "namespace";
+  case dil::TokenKind::l_paren:
+    return "l_paren";
+  case dil::TokenKind::r_paren:
+    return "r_paren";
+  case dil::TokenKind::unknown:
+    return "unknown";
+  default:
+    return "token_name";
+  }
+}
+
+static bool Is_Letter(char c) {
+  if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
+    return true;
+  return false;
+}
+
+static bool Is_Digit(char c) { return ('0' <= c && c <= '9'); }
+
+// A word starts with a letter, underscore, or dollar sign, followed by
+// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or  underscores.
+bool DILLexer::Is_Word(std::string::iterator start, uint32_t &length) {
+  bool done = false;
+  bool dollar_start = false;
+
+  // Must not start with a digit.
+  if (m_cur_pos == m_expr.end() || Is_Digit(*m_cur_pos))
+    return false;
+
+  // First character *may* be a '$', for a register name or convenience
+  // variable.
+  if (*m_cur_pos == '$') {
+    dollar_start = true;
+    ++m_cur_pos;
+    length++;
+  }
+
+  // Contains only letters, digits or underscores
+  for (; m_cur_pos != m_expr.end() && !done; ++m_cur_pos) {
+    char c = *m_cur_pos;
+    if (!Is_Letter(c) && !Is_Digit(c) && c != '_') {
+      done = true;
+      break;
+    } else
+      length++;
+  }
+
+  if (dollar_start && length > 1) // Must have something besides just '$'
+    return true;
+
+  if (!dollar_start && length > 0)
+    return true;
+
+  // Not a valid word, so re-set the lexing position.
+  m_cur_pos = start;
+  return false;
+}
+
+void DILLexer::UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,
+                                 std::string tok_str, uint32_t tok_pos) {
+  DILToken new_token;
+  result.setValues(tok_kind, tok_str, tok_pos);
+  new_token = result;
+  m_lexed_tokens.push_back(std::move(new_token));
+}
+
+bool DILLexer::Lex(DILToken &result, bool look_ahead) {
+  bool retval = true;
+
+  if (!look_ahead) {
+    // We're being asked for the 'next' token, and not a part of a LookAhead.
+    // Check to see if we've already lexed it and pushed it onto our tokens
+    // vector; if so, return the next token from the vector, rather than doing
+    // more lexing.
+    if ((m_tokens_idx != UINT_MAX) &&
+        (m_tokens_idx < m_lexed_tokens.size() - 1)) {
+      result = m_lexed_tokens[m_tokens_idx + 1];
+      return retval;
+    }
+  }
+
+  // Skip over whitespace (spaces).
+  while (m_cur_pos != m_expr.end() && *m_cur_pos == ' ')
+    m_cur_pos++;
+
+  // Check to see if we've reached the end of our input string.
+  if (m_cur_pos == m_expr.end()) {
+    UpdateLexedTokens(result, dil::TokenKind::eof, "", m_expr.length());
+    return retval;
+  }
+
+  uint32_t position = m_cur_pos - m_expr.begin();
+  ;
+  std::string::iterator start = m_cur_pos;
+  uint32_t length = 0;
+  if (Is_Word(start, length)) {
+    dil::TokenKind kind;
+    std::string word = m_expr.substr(position, length);
+    auto iter = Keywords.find(word);
+    if (iter != Keywords.end())
+      kind = iter->second;
+    else
+      kind = dil::TokenKind::identifier;
+
+    UpdateLexedTokens(result, kind, word, position);
+    return true;
+  }
+
+  switch (*m_cur_pos) {
+  case '(':
+    m_cur_pos++;
+    UpdateLexedTokens(result, dil::TokenKind::l_paren, "(", position);
+    return true;
+  case ')':
+    m_cur_pos++;
+    UpdateLexedTokens(result, dil::TokenKind::r_paren, ")", position);
+    return true;
+  case ':':
+    if (position + 1 < m_expr.size() && m_expr[position + 1] == ':') {
+      m_cur_pos += 2;
+      UpdateLexedTokens(result, dil::TokenKind::coloncolon, "::", position);
+      return true;
+    }
+    break;
+  default:
+    break;
+  }
+  // Empty Token
+  result.setValues(dil::TokenKind::none, "", m_expr.length());
+  return false;
+}
+
+const DILToken &DILLexer::LookAhead(uint32_t N) {
+  uint32_t extra_lexed_tokens = m_lexed_tokens.size() - m_tokens_idx - 1;
+
+  if (N + 1 < extra_lexed_tokens)
+    return m_lexed_tokens[m_tokens_idx + N + 1];
+
+  uint32_t remaining_tokens =
+      (m_tokens_idx + N + 1) - m_lexed_tokens.size() + 1;
+
+  bool done = false;
+  bool look_ahead = true;
+  while (!done && remaining_tokens > 0) {
+    DILToken tok;
+    Lex(tok, look_ahead);
+    if (tok.getKind() == dil::TokenKind::eof)
+      done = true;
+    remaining_tokens--;
+  };
+
+  if (remaining_tokens > 0) {
+    m_invalid_token.setValues(dil::TokenKind::invalid, "", 0);
+    return m_invalid_token;
+  }
+
+  return m_lexed_tokens[m_tokens_idx + N + 1];
+}
+
+const DILToken &DILLexer::AcceptLookAhead(uint32_t N) {
+  if (m_tokens_idx + N + 1 > m_lexed_tokens.size())
+    return m_invalid_token;
+
+  m_tokens_idx += N + 1;
+  return m_lexed_tokens[m_tokens_idx];
+}
+
+} // namespace dil
+
+} // namespace lldb_private
diff --git a/lldb/unittests/ValueObject/CMakeLists.txt b/lldb/unittests/ValueObject/CMakeLists.txt
index 8fcc8d62a79979..952f5411a98057 100644
--- a/lldb/unittests/ValueObject/CMakeLists.txt
+++ b/lldb/unittests/ValueObject/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_lldb_unittest(LLDBValueObjectTests
   DumpValueObjectOptionsTests.cpp
+  DILLexerTests.cpp
 
   LINK_LIBS
     lldbValueObject
diff --git a/lldb/unittests/ValueObject/DILLexerTests.cpp b/lldb/unittests/ValueObject/DILLexerTests.cpp
new file mode 100644
index 00000000000000..ec6ff86b64d36b
--- /dev/null
+++ b/lldb/unittests/ValueObject/DILLexerTests.cpp
@@ -0,0 +1,193 @@
+//===-- DILLexerTests.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "llvm/ADT/StringRef.h"
+#include "gtest/gtest.h"
+#include <string>
+
+using llvm::StringRef;
+
+TEST(DILLexerTests, SimpleTest) {
+  StringRef dil_input_expr("simple_var");
+  uint32_t tok_len = 10;
+  lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+  lldb_private::dil::DILToken dil_token;
+  dil_token.setKind(lldb_private::dil::TokenKind::unknown);
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::unknown);
+  dil_lexer.Lex(dil_token);
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+  EXPECT_EQ(dil_token.getSpelling(), "simple_var");
+  EXPECT_EQ(dil_token.getLength(), tok_len);
+  dil_lexer.Lex(dil_token);
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::eof);
+}
+
+TEST(DILLexerTests, TokenKindTest) {
+  StringRef dil_input_expr("namespace");
+  lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+  lldb_private::dil::DILToken dil_token;
+  dil_token.setKind(lldb_private::dil::TokenKind::unknown);
+
+  dil_lexer.Lex(dil_token);
+  EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), UINT_MAX);
+  dil_lexer.ResetTokenIdx(0);
+
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::kw_namespace);
+  EXPECT_TRUE(dil_token.isNot(lldb_private::dil::TokenKind::identifier));
+  EXPECT_FALSE(dil_token.is(lldb_private::dil::TokenKind::l_paren));
+  EXPECT_TRUE(dil_token.isOneOf(lldb_private::dil::TokenKind::eof,
+                                lldb_private::dil::TokenKind::kw_namespace));
+  EXPECT_FALSE(dil_token.isOneOf(lldb_private::dil::TokenKind::l_paren,
+                                 lldb_private::dil::TokenKind::r_paren,
+                                 lldb_private::dil::TokenKind::coloncolon,
+                                 lldb_private::dil::TokenKind::eof));
+
+  dil_token.setKind(lldb_private::dil::TokenKind::identifier);
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+}
+
+TEST(DILLexerTests, LookAheadTest) {
+  StringRef dil_input_expr("(anonymous namespace)::some_var");
+  lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+  lldb_private::dil::DILToken dil_token;
+  dil_token.setKind(lldb_private::dil::TokenKind::unknown);
+  uint32_t expect_loc = 23;
+
+  dil_lexer.Lex(dil_token);
+  EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), UINT_MAX);
+  dil_lexer.ResetTokenIdx(0);
+
+  // Current token is '('; check the next 4 tokens, to make
+  // sure they are the identifier 'anonymous', the namespace keyword,
+  // ')' and '::', in that order.
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::l_paren);
+  EXPECT_EQ(dil_lexer.LookAhead(0).getKind(),
+            lldb_private::dil::TokenKind::identifier);
+  EXPECT_EQ(dil_lexer.LookAhead(0).getSpelling(), "anonymous");
+  EXPECT_EQ(dil_lexer.LookAhead(1).getKind(),
+            lldb_private::dil::TokenKind::kw_namespace);
+  EXPECT_EQ(dil_lexer.LookAhead(2).getKind(),
+            lldb_private::dil::TokenKind::r_paren);
+  EXPECT_EQ(dil_lexer.LookAhead(3).getKind(),
+            lldb_private::dil::TokenKind::coloncolon);
+  // Verify we've advanced our position counter (lexing location) in the
+  // input 23 characters (the length of '(anonymous namespace)::'.
+  EXPECT_EQ(dil_lexer.GetLocation(), expect_loc);
+
+  // Our current index should still be 0, as we only looked ahead; we are still
+  // officially on the '('.
+  EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), 0);
+
+  // Accept the 'lookahead', so our current token is '::', which has the index
+  // 4 in our vector of tokens (which starts at zero).
+  dil_token = dil_lexer.AcceptLookAhead(3);
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::coloncolon);
+  EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), 4);
+
+  // Lex the final variable name in the input string
+  dil_lexer.Lex(dil_token);
+  dil_lexer.IncrementTokenIdx();
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+  EXPECT_EQ(dil_token.getSpelling(), "some_var");
+  EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), 5);
+
+  dil_lexer.Lex(dil_token);
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::eof);
+}
+
+TEST(DILLexerTests, MultiTokenLexTest) {
+  StringRef dil_input_expr("This string has several identifiers");
+  lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+  lldb_private::dil::DILToken dil_token;
+  dil_token.setKind(lldb_private::dil::TokenKind::unknown);
+
+  dil_lexer.Lex(dil_token);
+  EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), UINT_MAX);
+  dil_lexer.ResetTokenIdx(0);
+
+  EXPECT_EQ(dil_token.getSpelling(), "This");
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+  dil_lexer.Lex(dil_token);
+  dil_lexer.IncrementTokenIdx();
+
+  EXPECT_EQ(dil_token.getSpelling(), "string");
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+  dil_lexer.Lex(dil_token);
+  dil_lexer.IncrementTokenIdx();
+
+  EXPECT_EQ(dil_token.getSpelling(), "has");
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+  dil_lexer.Lex(dil_token);
+  dil_lexer.IncrementTokenIdx();
+
+  EXPECT_EQ(dil_token.getSpelling(), "several");
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+  dil_lexer.Lex(dil_token);
+  dil_lexer.IncrementTokenIdx();
+
+  EXPECT_EQ(dil_token.getSpelling(), "identifiers");
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+  dil_lexer.Lex(dil_token);
+  dil_lexer.IncrementTokenIdx();
+
+  EXPECT_EQ(dil_token.getSpelling(), "");
+  EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::eof);
+}
+
+TEST(DILLexerTests, IdentifiersTest) {
+  std::vector<std::string> valid_identifiers = {
+    "$My_name1",
+    "$pc",
+    "abcd",
+    "ab cd",
+    "_",
+    "_a",
+    "_a_",
+    "a_b",
+    "this",
+    "self",
+    "a",
+    "MyName"
+  };
+  std::vector<std::string> invalid_identifiers = {
+    "234",
+    "2a",
+    "2",
+    "$",
+    "1MyName",
+    "",
+    "namespace"
+  };
+
+  // Verify that all of the valid identifiers come out as identifier tokens.
+  for (auto str : valid_identifiers) {
+    StringRef dil_input_expr(str);
+    lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+    lldb_private::dil::DILToken dil_token;
+    dil_token.setKind(lldb_private::dil::TokenKind::unknown);
+
+    dil_lexer.Lex(dil_token);
+    EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+  }
+
+  // Verify that none of the invalid identifiers come out as identifier tokens.
+  for (auto str : invalid_identifiers) {
+    StringRef dil_input_expr(str);
+    lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+    lldb_private::dil::DILToken dil_token;
+    dil_token.setKind(lldb_private::dil::TokenKind::unknown);
+
+    dil_lexer.Lex(dil_token);
+    EXPECT_TRUE(dil_token.isNot(lldb_private::dil::TokenKind::identifier));
+    EXPECT_TRUE(dil_token.isOneOf(lldb_private::dil::TokenKind::unknown,
+                                  lldb_private::dil::TokenKind::none,
+                                  lldb_private::dil::TokenKind::eof,
+                                  lldb_private::dil::TokenKind::kw_namespace));
+  }
+}

Update CMakeLists.txt to build DILLexer.cpp.

github-actions · 2025-01-19T17:29:29Z

✅ With the latest revision this PR passed the C/C++ code formatter.

werat · 2025-01-19T20:21:18Z

lldb/include/lldb/ValueObject/DILLexer.h

+
+/// Class defining the tokens generated by the DIL lexer and used by the
+/// DIL parser.
+class DILToken {


Since we already have a dill:: namespace, maybe we can drop DIL prefix?

werat · 2025-01-19T20:24:26Z

lldb/include/lldb/ValueObject/DILLexer.h

+/// Class for doing the simple lexing required by DIL.
+class DILLexer {
+public:
+  DILLexer(llvm::StringRef dil_expr) : m_expr(dil_expr.str()) {


I think if we accept StringRef as input, we shouldn't copy the data and work with the provided string view (and assume we don't outlive it).
If you need the lexer to own the text (and you probably don't), then accept std::string and move from it.

werat · 2025-01-19T20:32:23Z

lldb/include/lldb/ValueObject/DILLexer.h

+
+  /// Update the index for the 'current' token, to point to the next lexed
+  /// token.
+  bool IncrementTokenIdx() {


Shouldn't Lex() do this automatically?

Not really. Lex() is called from LookAhead, when we definitely do not want to automatically increment the token index.

werat · 2025-01-19T20:45:40Z

lldb/include/lldb/ValueObject/DILLexer.h

+  /// being handled by the DIL parser.
+  const DILToken &LookAhead(uint32_t N);
+
+  const DILToken &AcceptLookAhead(uint32_t N);


I think this API might be simpler. The lexer doesn't actually need to re-lex, the results will always be the same. We only need to rollback occasionally, but we'll always be process the same sequence of tokens the second time.

So the lexer can always add the tokens to the m_lexed_tokens vector and we only need GetCurrentTokenIdx() and ResetTokenIdx() to do the rollback.

That would definitely be nice, and if its true, I'd consider taking this even further: given that we're going to be processing about a line of text at most, we might be able to just eagerly lex the whole string and avoid the whole on-demand parsing business. If so, maybe we don't even need the lexer class and the lexing could consist of a single function like std::vector<Token> Lex(string) ?

Re werat's comment: This is already what the lexer does. It never re-lexes anything. It stores the lexed tokens in a vector as it lexes them, and keeps track, via index into the vector, which token is the 'current' one (as far as the parser is concerned). Accepting the lookahead basically involves just updating the index.

That's good to hear, but it still makes the implementation complicated. Separating lexing from the traversal through the lexed tokens would make things much easier to follow. I understand traditional lexers can't do that, but hey are parsing megabytes of text. We're going to be parsing approximately one line of code here.

Yesterday I also thought of another advantage of early/eager parsing -- we can report errors (things like invalid tokens and such) early. Then the other functions ("get next token" and stuff) shouldn't have any failure modes except running off the end of token stream (which I think we could handle by pretending the stream ends with an infinite amount of EOF tokens)

werat · 2025-01-19T20:45:58Z

lldb/source/ValueObject/DILLexer.cpp

+  }
+
+  uint32_t position = m_cur_pos - m_expr.begin();
+  ;


Suggested change

;

werat · 2025-01-19T20:46:25Z

lldb/source/ValueObject/DILLexer.cpp

+  bool look_ahead = true;
+  while (!done && remaining_tokens > 0) {
+    DILToken tok;
+    Lex(tok, look_ahead);


Should we check the return value here?

werat · 2025-01-19T20:47:31Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+TEST(DILLexerTests, SimpleTest) {
+  StringRef dil_input_expr("simple_var");
+  uint32_t tok_len = 10;
+  lldb_private::dil::DILLexer dil_lexer(dil_input_expr);


IMO can drop dil_ here. All these tests are in the context of DIL, so it's clear what it's all about.

Suggested change

lldb_private::dil::DILLexer dil_lexer(dil_input_expr);

lldb_private::dil::DILLexer lexer(input_expr);

labath · 2025-01-20T08:34:37Z

lldb/include/lldb/ValueObject/DILLexer.h

+
+namespace dil {
+
+enum class TokenKind {


Optional idea: Make this a non-class enum inside the (DIL)Token class so that it can be referred to as Token::coloncolon

labath · 2025-01-20T08:37:33Z

lldb/include/lldb/ValueObject/DILLexer.h

+  void setValues(dil::TokenKind kind, std::string spelling, uint32_t start) {
+    m_kind = kind;
+    m_spelling = spelling;
+    m_start_pos = start;
+  }


could we use the assignment operator instead (token = Token(kind, spelling, start)) ?

labath · 2025-01-20T08:39:19Z

lldb/include/lldb/ValueObject/DILLexer.h

+    m_tokens_idx = UINT_MAX;
+  }
+
+  bool Lex(DILToken &result, bool look_ahead = false);


Suggested change

bool Lex(DILToken &result, bool look_ahead = false);

std::optional<DILToken> Lex(bool look_ahead = false);

labath · 2025-01-20T08:47:40Z

lldb/include/lldb/ValueObject/DILLexer.h

+  /// being handled by the DIL parser.
+  const DILToken &LookAhead(uint32_t N);
+
+  const DILToken &AcceptLookAhead(uint32_t N);


That would definitely be nice, and if its true, I'd consider taking this even further: given that we're going to be processing about a line of text at most, we might be able to just eagerly lex the whole string and avoid the whole on-demand parsing business. If so, maybe we don't even need the lexer class and the lexing could consist of a single function like std::vector<Token> Lex(string) ?

labath · 2025-01-20T08:50:40Z

lldb/include/lldb/ValueObject/DILLexer.h

+  /// Update 'result' with the other paremeter values, create a
+  /// duplicate token, and push the duplicate token onto the vector of
+  /// lexed tokens.
+  void UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,


Should all of these APIs really be public?

A few of them can be private (I'll take care of that). Many of them could be protected (mostly just called from the parser, which I can designate as a friend), except that if I make them protected then my unittest, which needs to access them, can't access them. I've been trying and trying to find a way to allow my unit tests access these methods when they're protected, but I can't seem to make it work. Do you know the magic secret for this?

There are workarounds, but no magic secrets. If something is called by the parser (which is the only user of the class anyway), then it should be public (btw, friends can see even private members, not just the public ones). And unit tests should generally test using the public APIs (as that's what the users will use). There are exceptions to that, but they usually involve situations where the public API requires arguments that would be difficult/impossible to mock in a test, which shouldn't be the case here.

Separating the public API from the private implementation details would really help this class, as I really don't know which one of these functions is meant to be called from the outside.

labath · 2025-01-20T09:29:43Z

lldb/source/ValueObject/DILLexer.cpp

+  // Empty Token
+  result.setValues(dil::TokenKind::none, "", m_expr.length());
+  return false;


What is the meaning of an empty token?

Roughly the same as unknown. I will change the TokenKind above to unknown.

labath · 2025-01-20T09:40:37Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+  };
+
+  // Verify that all of the valid identifiers come out as identifier tokens.
+  for (auto str : valid_identifiers) {


https://llvm.org/docs/CodingStandards.html#beware-unnecessary-copies-with-auto

labath · 2025-01-20T09:41:33Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+    StringRef dil_input_expr(str);
+    lldb_private::dil::DILLexer dil_lexer(dil_input_expr);


Suggested change

StringRef dil_input_expr(str);

lldb_private::dil::DILLexer dil_lexer(dil_input_expr);

SCOPED_TRACE(str);

lldb_private::dil::DILLexer dil_lexer(str);

the explicit string ref construction is unnecessary. SCOPED_TRACE ensures that in case of a test failure, the error message will contain the string which caused it to fail.

labath · 2025-01-20T09:53:01Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+}
+
+TEST(DILLexerTests, MultiTokenLexTest) {
+  StringRef dil_input_expr("This string has several identifiers");


I think this would read better if there was helper function that would lex the string and then return the lexing result in a nicely-comparable for. E.g., with a function like
std::vector<std::pair<TokenKind, std::string>> Lex(StringRef input), this whole test would boil down to

EXPECT_THAT(Lex("This string has several identifiers"), testing::ElementsAre(testing::Pair(TokenKind::identifier, "This"), testing::Pair(TokenKind::identifier, "string"), testing::Pair(TokenKind::identifier, "has"), testing::Pair(TokenKind::identifier, "several"), testing::Pair(TokenKind::identifier, "identifiers"), testing::Pair(TokenKind::eof, "")));

Making it easy to write test like this is important, as I expect most of the lexer tests to be of this form, and I want to encourage people to write more tests like this for various corner cases, of which there are many in a lexer implementation (for example, test that a colon at the of a string does not access beyond the end of the string when it checks whether it's a part of ::)

labath · 2025-01-20T10:10:22Z

lldb/source/ValueObject/DILLexer.cpp

+    return true;
+  }
+
+  switch (*m_cur_pos) {


An iterator is sort of a natural representation of a position in a string, but it also makes it hard to use many of the nice (safe) string APIs. For example, if we had a StringRef m_rest to represent the unparsed portion of the string, then this block could be written as:

constexpr std::pair<TokenKind, const char *> operators = { {TokenKind::l_paren, "("}, {TokenKind::r_paren, ")"}, {TokenKind::coloncolon, "::"}, {TokenKind::colon, ":"}, }; size_t position = m_rest.data() - m_input.data(); for (auto [kind, str]: operators) { if (m_rest.consume_front(str)) { UpdateLexedTokens(result, kind, str, position); return true; } }

(Notice how I do not have to worry about running beyond the end of the string, how the "current position" is updated automatically, and how the tokens which are prefixes of one another are handled by putting the longer string first.)

There are also other places where this could be useful. E.g. skipping over whitespace could be implemented as m_rest = m_rest.ltrim() and similar.

lldb/include/lldb/ValueObject/DILLexer.h

- Remove "DIL" prefix from DILTokenKind and DILToken. - Change the token kind from an enum class to an enum inside the Token class. - Use CamelCase for all the method names. - Replace Token::SetValues method with assignments. - Use a StringRef, not std::string, to hold the input string in the lexer. - Update the lexer to lex all the tokens at one time. Added two new methods for this: LexAll and GetNextToken. - Made some of the Lexer methods private. - Replaces StringMap with StringSwitch for fast keyword lookups. - Updated GetTokenName to directly return StringRefs; removed default case from switch statement. - Cleaned up code format in IsLetter & IsDigit. - Updated IsWord too return an iterator range containing the word (if any). - Updated Lex function (now called by LexAll) to return an llvm::Expected token; removed look_ahead checks; changed the operator lexing to use a vector of operators (as suggested). - Cleaned up LookAhead method, now that we know all tokens have already been lexed. - Added helper function to unittests, to help check a sequence of tokens. - Generally cleaned up the tests to deal with all the code changes.

cmtice · 2025-01-26T01:00:22Z

I think I have addressed all the comments & requests. Please review this again. Thanks!

labath

There are some simplifications (and one rewrite :P) that I'd like to talk about, but I think we're not far.

The main thing that's bothering me is this identifier vs. keyword issue (and for this part, I'd like to loop in @jimingham (at least)).

Your implementation takes a very strict view of what's a possible identifier (it must consist of a very specific set of characters, appearing in a specific order, and it must not be a keyword). In contrast, the current "frame variable" implementation basically treats anything it doesn't recognise (including whitespace) as a variable name (and it has no keywords):

(lldb) v a*b
error: no variable named 'a*b' found in this frame
(lldb) v "a b"
error: no variable named 'a b' found in this frame
(lldb) v 123
error: no variable named '123' found in this frame
(lldb) v namespace
error: no variable named 'namespace' found in this frame

Now, obviously, in order to expand the language, we need to restrict the set of variable names, but I don't think we need to do it so aggressively. I don't think anyone will complain if we make it harder for him to access a variable called a*b, but for example, namespace, and 💩 are perfectly valid variable names in many languages (one of them is C).

For this reason, I think it'd be better to have a very liberal definition of what constitutes a possible variable name (identifier). Instead of a prescribing a character sequence, I think it be better to say that anything that doesn't contain one of the characters we recognize (basically: operators) is an identifier. IOW, to do something like frame variable does right now.

As for keywords, I think it'd be best to have as few of them as possible, and for the rest, to treat their keyword-ness as context-dependent whereever possible. What I mean by that is to recognise them as keywords only within contexts where such usage would be legal. The namespace keyword is a prime example of that. I think the only place where this can appear as a keyword in the context of the DIL is within the (anonymous namespace) group. If that's the case, then I think we should be able to detect that and disambiguate the usage, so that e.g. v namespace prints the variable called namespace and v (anonymous namespace)::namespace prints the variable called namespace in an anonymous namespace (in an evil language which has anonymous namespaces but allows you to use variables called namespace).

The way to do that is probably to not treat the string "namespace" as a keyword at the lexer level, but to recognize it later, when we have more context available.

What do you all think?

labath · 2025-01-27T09:54:00Z

lldb/include/lldb/ValueObject/DILLexer.h

+namespace lldb_private {
+
+namespace dil {


Suggested change

namespace lldb_private {

namespace dil {

namespace lldb_private::dil {

labath · 2025-01-27T09:57:09Z

lldb/include/lldb/ValueObject/DILLexer.h

+  Token() : m_kind(Kind::none), m_spelling(""), m_start_pos(0) {}
+
+  void SetKind(Kind kind) { m_kind = kind; }


Could we get rid of these (and of the none token kind)? Ideally, I'd like to treat the token as a value type (so you can assign and copy it, but not mess with it), and one that is always valid (no uninitialized (none) state).

I can get rid of the constructor on line 44, and I can get rid of the none token, but I need to keep SetKind, in order to separate ">>" into two ">"s, in the case where we have determined that ">>" is actually two single angle brackets closing a template, such as std::vector<std::pair<int,int>>. In that case, I'll also need to insert a token into the middle of the tokens vector -- not great, but may not happen that often, and the vector shouldn't be that long.

That doesn't remove your ability to modify a token, just how you how you do it (you update the token as a whole, instead of doing it piece wise). So, instead of something like token.SetKind(right_angle); token.SetSpelling(">"); you'd do token = Token(right_angle, ">", token.GetPosition()).

That said, modification of the token stream and multi-pass (tentative) parsing doesn't sound like an ideal combination. How do you ensure that the modification does not leak into subsequent actions if the tentative parse is aborted?

Even if you can, the fact that you have to consider this makes me uneasy. Would it be possible to do this without modifying the token stream? So like, when the parser (I'm assuming this happens in the parser, as you need more context to disambiguate these) encounters a ">>" token and the current context allows you to treat this as a double closing template bracket, you act as if you encountered two ">" tokens, but without actually updating the tokens in the lexer.

labath · 2025-01-27T10:30:10Z

lldb/include/lldb/ValueObject/DILLexer.h

+
+  llvm::Expected<bool> LexAll();
+
+  /// Return the lexed token N+1 positions ahead of the 'current' token


I feel like there are too many ways to navigate the token stream here. You can either call GetCurrentToken+IncrementTokenIdx, or GetNextToken(which I guess increments the index automatically), or LookAhead+AcceptLookAhead.

I think it would be better to start with something simple (we can add more or revamp the existing API if it turns out to be clunky). What would you say to something like:

const Token &LookAhead(uint32_t N /* add `=1` if you want*/); const Token &GetCurrentToken() { return LookAhead(0); } // just a fancy name for a look ahead of zero void Advance(uint32_t N = 1); // advance the token stream

The parser really needs a way to save & restore/reset the token index, because there are places in the parser where it does tentative parsing & then decides to rollback. It does so by saving the current token index, then doing the tentative parsing (which can advance the index some number of tokens), and then (for rolling back) setting the current token index back to the saved value.

So I don't think the simple API you've outlined above would be sufficient.

I'm fine with tentative parse and roll back APIs. I'm commenting on the other APIs which advance through the token stream linearly (but in a very baroque fashion). IOW, my proposal was to replace GetCurrentToken, IncrementTokenIdx, GetNextToken, LookAhead and AcceptLookAhead with the three functions above (exact names TBD), and keep GetCurrentTokenIdx and ResetTokenIdx as they are.

labath · 2025-01-27T10:33:18Z

lldb/include/lldb/ValueObject/DILLexer.h

+  bool ResetTokenIdx(uint32_t new_value) {
+    if (new_value > m_lexed_tokens.size() - 1)
+      return false;
+
+    m_tokens_idx = new_value;
+    return true;
+  }


Suggested change

bool ResetTokenIdx(uint32_t new_value) {

if (new_value > m_lexed_tokens.size() - 1)

return false;

m_tokens_idx = new_value;

return true;

}

void ResetTokenIdx(uint32_t new_value) {

assert(new_value < m_lexed_tokens.size());

m_tokens_idx = new_value;

}

(AIUI, the only usage of this function will be to restore a previous (and valid) position, so any error here is definitely a bug)

labath · 2025-01-27T10:33:49Z

lldb/include/lldb/ValueObject/DILLexer.h

+    return true;
+  }
+
+  uint32_t GetLocation() { return m_cur_pos - m_expr.begin(); }


should this be private?

labath · 2025-01-27T11:42:56Z

lldb/source/ValueObject/DILLexer.cpp

+}
+
+const Token &DILLexer::LookAhead(uint32_t N) {
+  if (m_tokens_idx + N + 1 < m_lexed_tokens.size())


You didn't say anything about what you make of my suggestion to use an "infinite" stream of eof tokens at the end. The current implementation uses an infinite stream of invalid tokens, but I doubt that anything cares whether it's working with an invalid or eof token (if you're looking ahead, chances are you know what you're expecting to find, and you just want to know whether your token matches that expectation).

That would allow us to get rid of another pseudo-token and the m_invalid_token member.

I think the "infinite" stream of eof tokens at the end is ok; I will give it a try. :-)

labath · 2025-01-27T11:48:52Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+
+using llvm::StringRef;
+
+bool VerifyExpectedTokens(


There's a reason why I used EXPECT_THAT(..., ElementsAre() in my example -- it gives much better error messages in case of failure: It will print the expected vector, the actual vector, and also the element which differs.

With an implementation like this, all you get is error: false != true. Which one would you rather debug?

labath · 2025-01-27T11:50:26Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+  auto success = lexer.LexAll();
+
+  if (!success) {
+    EXPECT_TRUE(false);
+  }


Suggested change

auto success = lexer.LexAll();

if (!success) {

EXPECT_TRUE(false);

}

ASSERT_THAT_EXPECTED(lexer.LexAll(), llvm::HasValue(true));`

labath · 2025-01-27T11:50:37Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+  EXPECT_EQ(token.GetSpelling(), "simple_var");
+  EXPECT_EQ(token.GetLength(), tok_len);
+  token = lexer.GetNextToken();
+  ;


labath · 2025-01-27T11:51:58Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+  token = lexer.GetNextToken();
+  EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier);
+  EXPECT_EQ(token.GetSpelling(), "simple_var");
+  EXPECT_EQ(token.GetLength(), tok_len);


I'm wondering if the GetLength function is really necessary. How often is it going to be called? Because if it's not very often, we might as well use GetSpelling().size().

jimingham · 2025-01-29T00:30:47Z

There are some simplifications (and one rewrite :P) that I'd like to talk about, but I think we're not far.

The main thing that's bothering me is this identifier vs. keyword issue (and for this part, I'd like to loop in @jimingham (at least)).

Your implementation takes a very strict view of what's a possible identifier (it must consist of a very specific set of characters, appearing in a specific order, and it must not be a keyword). In contrast, the current "frame variable" implementation basically treats anything it doesn't recognise (including whitespace) as a variable name (and it has no keywords):
(lldb) v a*b
error: no variable named 'a*b' found in this frame
(lldb) v "a b"
error: no variable named 'a b' found in this frame
(lldb) v 123
error: no variable named '123' found in this frame
(lldb) v namespace
error: no variable named 'namespace' found in this frame
Now, obviously, in order to expand the language, we need to restrict the set of variable names, but I don't think we need to do it so aggressively. I don't think anyone will complain if we make it harder for him to access a variable called a*b, but for example, namespace, and 💩 are perfectly valid variable names in many languages (one of them is C).

For this reason, I think it'd be better to have a very liberal definition of what constitutes a possible variable name (identifier). Instead of a prescribing a character sequence, I think it be better to say that anything that doesn't contain one of the characters we recognize (basically: operators) is an identifier. IOW, to do something like frame variable does right now.

As for keywords, I think it'd be best to have as few of them as possible, and for the rest, to treat their keyword-ness as context-dependent whereever possible. What I mean by that is to recognise them as keywords only within contexts where such usage would be legal. The namespace keyword is a prime example of that. I think the only place where this can appear as a keyword in the context of the DIL is within the (anonymous namespace) group. If that's the case, then I think we should be able to detect that and disambiguate the usage, so that e.g. v namespace prints the variable called namespace and v (anonymous namespace)::namespace prints the variable called namespace in an anonymous namespace (in an evil language which has anonymous namespaces but allows you to use variables called namespace).

The way to do that is probably to not treat the string "namespace" as a keyword at the lexer level, but to recognize it later, when we have more context available.

What do you all think?

The only way that the DIL lexer can handle keywords correctly would be if it knew in advance what language it was parsing for. Otherwise, we're going to be over broad for some language we support. Another consideration is that the DIL also is expected to parse the names given to children by synthetic child providers. At present we don't put any restrictions on those, so if you wanted you could produce a synthetic child called a*b... Again, if we wanted to start identifying keywords, we'd need to restrict the names of synthetic child providers to "legal for this language".

So I agree with Pavel, as much as possible the lexer should be permissive, really only handling the tokens we use to delimit words - ., -> and now that we're supporting casts (), but staying out of the business of imposing rules from any particular language.

For instance, currently the name of the synthetic child providers for the nth element of a std::vector is [0]:

(lldb) n
Process 79383 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = step over
    frame #0: 0x0000000100000548 foo`main at foo.cpp:7
   4   	main()
   5   	{
   6   	  std::vector<int> my_vec = {1, 3, 5};
-> 7   	  return my_vec[1];
    	         ^
   8   	}
Target 0: (foo) stopped.
(lldb) script
Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D.
>>> var = lldb.frame.FindVariable("my_vec")
>>> var.GetChildAtIndex(0).GetName()
'[0]'

So the lexer can't know about [] as a separate entity. It has to just be a name.

labath · 2025-01-29T10:48:11Z

I agree with everything except for the last part. The current parser already threats [] very specially. I think it has to do that so it can treat pointers as C arrays (and then it just special cases synthetic objects). I think that's a (mostly *) reasonable setup that we could replicate in the new DIL implementation, particularly as we would need special handling of [] to implement things like [i+1].

FWIW, I believe that the main source of keywords in Caroline's implementation is types (for casts). I think they could be handled generically (just ask the target whether the identifier names a type), were it not for the nasty C(++) habit of creating multiword types (unsigned long long long long int). Still, it feels there ought to be recognise these without making unsigned a full-fledged keyword.

(*) I was recently made aware of an unfortunate difference in behavior of frame var and expr for map types:

(lldb) v m
(std::map<int, int>) m = size=3 {
  [0] = (first = -42, second = -47)
  [1] = (first = 0, second = 42)
  [2] = (first = 42, second = 47)
}
(lldb) v m[0]
(std::__1::__value_type<int, int>::value_type) m[0] = (first = -42, second = -47)
(lldb) expr m[0]
(std::map<int, int>::mapped_type) $0 = 42

I know that these are different languages, but this still seems like it could confuse some people. I don't have any specific ideas on how to fix/improve this though...

cmtice · 2025-01-29T17:28:37Z

If DIL is not allowed to recognize any operators except period, arrow, and parentheses, it seems to me that it won't be any more powerful than the current 'frame variable' implementation, in which case what is the point of having it at all? (As a matter of fact, it would be less powerful, since 'frame variable' also recognizes square brackets and star as valid operators and treats them accordingly).

At the same time, I understand the concern about wanting to be able to parse synthetic variable names, which can contain arbitrary operator-like symbols. So I have a tentative proposal: How about insisting that synthetic variable names need to be surrounded by special delimiters, such as backquote or slash? Or having some other way of marking them as special?

jimingham · 2025-01-29T18:51:07Z

If DIL is not allowed to recognize any operators except period, arrow, and parentheses, it seems to me that it won't be any more powerful than the current 'frame variable' implementation, in which case what is the point of having it at all? (As a matter of fact, it would be less powerful, since 'frame variable' also recognizes square brackets and star as valid operators and treats them accordingly).

At the same time, I understand the concern about wanting to be able to parse synthetic variable names, which can contain arbitrary operator-like symbols. So I have a tentative proposal: How about insisting that synthetic variable names need to be surrounded by special delimiters, such as backquote or slash? Or having some other way of marking them as special?

These are unfortunately visible to the user, and they will have to type them in frame var to access the child. So we can't make them something ugly.

cmtice · 2025-01-29T18:58:36Z

If DIL is not allowed to recognize any operators except period, arrow, and parentheses, it seems to me that it won't be any more powerful than the current 'frame variable' implementation, in which case what is the point of having it at all? (As a matter of fact, it would be less powerful, since 'frame variable' also recognizes square brackets and star as valid operators and treats them accordingly).
At the same time, I understand the concern about wanting to be able to parse synthetic variable names, which can contain arbitrary operator-like symbols. So I have a tentative proposal: How about insisting that synthetic variable names need to be surrounded by special delimiters, such as backquote or slash? Or having some other way of marking them as special?

These are unfortunately visible to the user, and they will have to type them in frame var to access the child. So we can't make them something ugly.

I don't want to make them ugly, but I do think we need to ask the user to distinguish them in some way. Otherwise I don't see how we can make DIL any more powerful than the current 'frame variable'.

jimingham · 2025-01-29T19:07:39Z

I agree with everything except for the last part. The current parser already threats [] very specially. I think it has to do that so it can treat pointers as C arrays (and then it just special cases synthetic objects). I think that's a (mostly *) reasonable setup that we could replicate in the new DIL implementation, particularly as we would need special handling of [] to implement things like [i+1].

We would have to change how synthetic children are made currently for array like things for this to work. If I have a std:vector called foo and I write:

(lldb) v foo[0]

and the DIL parses that it will either ask foo for a child named 0, which doesn't exist, or ask it for the 0th child which currently is not required to be the one called [0]. I think I mentioned this back when we were first discussing this, but if we want to make this part work, then we have to add some API on the synthetic child provider like AddVectorChild(size_t idx, ValueObjectSP child_sp) and require you use that to make vector like things. Then we could require that the child named [0] HAS to be the 0th child, etc.

We still have the problem of all the synthetic child providers in the wild that might not do this.

FWIW, I believe that the main source of keywords in Caroline's implementation is types (for casts). I think they could be handled generically (just ask the target whether the identifier names a type), were it not for the nasty C(++) habit of creating multiword types (unsigned long long long long int). Still, it feels there ought to be recognise these without making unsigned a full-fledged keyword.

That comes back to how soon we expect to know the language as we are parsing. unsigned is a perfectly good variable name in swift, for instance... I don't think we want to limit lldb to only C-family languages.

(*) I was recently made aware of an unfortunate difference in behavior of frame var and expr for map types:
(lldb) v m
(std::map<int, int>) m = size=3 {
  [0] = (first = -42, second = -47)
  [1] = (first = 0, second = 42)
  [2] = (first = 42, second = 47)
}
(lldb) v m[0]
(std::__1::__value_type<int, int>::value_type) m[0] = (first = -42, second = -47)
(lldb) expr m[0]
(std::map<int, int>::mapped_type) $0 = 42
I know that these are different languages, but this still seems like it could confuse some people. I don't have any specific ideas on how to fix/improve this though...

Yes, collection classes that use arbitrary types for the keys are problematic. How would we show the map if the keys are structures with some non-trivial compare function in a way that wouldn't be awful to use? I don't think we can get around people having to know that v path expressions are NOT expressions in the underlying language.

labath · 2025-01-30T08:53:25Z

We would have to change how synthetic children are made currently for array like things for this to work. If I have a std:vector called foo and I write:

(lldb) v foo[0]

and the DIL parses that it will either ask foo for a child named 0, which doesn't exist, or ask it for the 0th child which currently is not required to be the one called [0]. I think I mentioned this back when we were first discussing this, but if we want to make this part work, then we have to add some API on the synthetic child provider like AddVectorChild(size_t idx, ValueObjectSP child_sp) and require you use that to make vector like things. Then we could require that the child named [0] HAS to be the 0th child, etc.

We still have the problem of all the synthetic child providers in the wild that might not do this.

I don't understand why we couldn't emulate exactly what happens now. Current frame variable doesn't simply ask for the child named [0]. It can't do that because pointers don't have children like that. Instead, it parsed the thing between the brackets into an integer, and then calls ValueObject::GetSyntheticArrayMember with that integer. This function turns the integer back to a string, re-adds the brackets and then asks for the synthetic child with that name.

This roundaboutness has a very nice property in that you can specify the child number in any way you want, and it will get normalized to the right form:

(lldb) v m[1]
(std::__1::__value_type<int, int>::value_type) m[1] = (first = 0, second = 42)
(lldb) v m[0000001]
(std::__1::__value_type<int, int>::value_type) m[0000001] = (first = 0, second = 42)
(lldb) v m[0x000001]
(std::__1::__value_type<int, int>::value_type) m[0x000001] = (first = 0, second = 42)

And it definitely gives the impression that lldb "sees" into the array index, so I wouldn't at all be surprised if someone tried v m[1+1] and expected it to produce the child called [2].

That's not to say that having the formatters provide more information about the structure of the children wouldn't be nice. For example, in the DAP protocol, we're expected to divide the children of an object into "named" and "indexed" categories. We can't do that right now, as we don't get that information. However, I don't think this needs to be tied to this work in any way.

These are unfortunately visible to the user, and they will have to type them in frame var to access the child. So we can't make them something ugly.

I don't want to make them ugly, but I do think we need to ask the user to distinguish them in some way. Otherwise I don't see how we can make DIL any more powerful than the current 'frame variable'.

I think there could be some reasonable middle ground here. Quoting/escapting every synthetic child is not going to fly. However, I wouldn't have a problem with asking users to quote "weird" child names. So, like, if there's a variable called unsigned somewhere, we'd want v unsigned to work. But if some formatter decides to name its child a*b (or even a.b), then i think it's fine if we ask users to mark that specially. We would need to be careful about the choice of quoting mechanism, and we'd probably need to change the frame variable command to take "raw" input to prevent all the quotes from getting stripped, but I think it's doable.

Also, if we have this feature, I don't think it needs to be limited to synthetic children. There's no reason why not to use the same mechanism for regular variables as well. Even though most (but not all) languages do not support weird variable names like a*b, DWARF is definitely capable of expressing them, so we might as well support that.

Yes, collection classes that use arbitrary types for the keys are problematic. How would we show the map if the keys are structures with some non-trivial compare function in a way that wouldn't be awful to use? I don't think we can get around people having to know that v path expressions are NOT expressions in the underlying language.

Complex key types are hard for sure, and I think the current setup is perfectly reasonable for them. It's just that the result for integral keys (which are pretty common) is not ideal. It would be nice if users had a way to answer questions like "in my map of million keys, which value does the number 47 map to?" without an expression evaluator (which will often not compile). Any solution here is most certainly going to require the data formatters to provide more info...

- Remove 'namespace' as a keyword (make it a normal identifier) - Remove 'invalid' and 'none' token types. - Remove unnecessary SetKind and GetLength methods from Tokens. - Re-arrange Lexer: - Give it a static Create method, which pre-lexes all the tokens - Make Lex method static - Pull IsWord method out of Lexer class - Make the Lexer constructor private. - Remove LexAll, GetLocation, UpdateLexedTokens, AcceptLookAhead, GetNextToken, and IncrementTokenIdx methods from Lexer class. - Add new 'Advance' method (to help replace some of the removed methods). - Update indexing in LookAhead (LookAead(0) now means the 'current' token). - Remove m_cur_pos data member from Lexer class. - Replace m_invalid_token with m_eof_token. - Use 'remainder' StringRef to help with lexing. - Update the unit tests to handle all the code changes in the Lexer. - Update the unit tests to use ASSERT_THAT_EXPECTED to check llvm::Expected return values. - Update the unit tests to use "testing::ElementsAre(testing::Pair ..." to verify all the lexed tokens; also added helper function ExtractTokenData, and deleted function VerifyExpectedTokens.

cmtice · 2025-01-30T23:10:52Z

I think I have addressed/fixed all the latest review comments. Please take another look now.

labath

Apart from the (mainly stylistic) inline comments, the biggest problem I see is that the definition of an identifier is still too narrow. The restriction on the dollar sign is completely unnecessary as C will let you put that anywhere. And it doesn't allow any non-ascii characters.

I really think this should be based on an deny- rather than an allow-list. Any character we don't claim for ourselves should be fair game for an identifier. If someone manages to enter the backspace character (\x7f) into the expression, then so be it.

The question of "identifiers" starting with digits is interesting. Personally, I think it'd be fine to reject those (and require the currenly-vapourware quoting syntax), because I suspect you want to accept number suffixes, and I think it'd be confusing to explain why 123x is a valid identifier but 123u is not, but I suspect some might have a different opinion.

We could continue discussing that here, or we could accept everything here, and postpone this discussion for the patch which starts parsing numbers. Up to you..

labath · 2025-01-31T08:33:10Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+  token = lexer.GetCurrentToken();
+  EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier);
+  EXPECT_EQ(token.GetSpelling(), "simple_var");
+  EXPECT_EQ(token.GetSpelling().size(), tok_len);


Now the size check is unnecessary, as this is just testing the StringRef implementation

labath · 2025-01-31T08:39:27Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+  StringRef input_expr("namespace");
+  llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer =
+      lldb_private::dil::DILLexer::Create(input_expr);
+  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+  lldb_private::dil::DILLexer lexer(*maybe_lexer);
+  lldb_private::dil::Token token =
+      lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0);
+  lexer.Advance();
+  token = lexer.GetCurrentToken();


Suggested change

StringRef input_expr("namespace");

llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer =

lldb_private::dil::DILLexer::Create(input_expr);

ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());

lldb_private::dil::DILLexer lexer(*maybe_lexer);

lldb_private::dil::Token token =

lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0);

lexer.Advance();

token = lexer.GetCurrentToken();

lldb_private::dil::Token token =

lldb_private::dil::Token(lldb_private::dil::Token::identifier, "ident", 0);

(The reason is that this looks like a test for the token kind testing functions. You don't actually need the token to come out of the lexer for that. And we already have tests that check that producing a sequence of characters produces an "identifier" token.)

labath · 2025-01-31T08:40:10Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+
+  // Our current index should still be 0, as we only looked ahead; we are still
+  // officially on the '('.
+  EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)0);


Suggested change

EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)0);

EXPECT_EQ(lexer.GetCurrentTokenIdx(), 0u);

(and elsewhere)

labath · 2025-01-31T08:45:47Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+  EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)5);
+  // Verify we've advanced our position counter (lexing location) in the
+  // input 23 characters (the length of '(anonymous namespace)::'.
+  EXPECT_EQ(token.GetLocation(), expect_loc);


you might just say strlen("(anonymous namespace)::") and skip the constant and the comment (this is a test, so it doesn't have to be super-efficient)

labath · 2025-01-31T08:52:57Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+  StringRef input_expr("This string has (several ) ::identifiers");
+  llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer =
+      lldb_private::dil::DILLexer::Create(input_expr);
+  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+  lldb_private::dil::DILLexer lexer(*maybe_lexer);
+  lldb_private::dil::Token token =
+      lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0);
+
+  std::vector<std::pair<lldb_private::dil::Token::Kind, std::string>>
+      lexer_tokens_data = ExtractTokenData(lexer);
+
+  EXPECT_THAT(
+      lexer_tokens_data,
+      testing::ElementsAre(
+          testing::Pair(lldb_private::dil::Token::identifier, "This"),
+          testing::Pair(lldb_private::dil::Token::identifier, "string"),
+          testing::Pair(lldb_private::dil::Token::identifier, "has"),
+          testing::Pair(lldb_private::dil::Token::l_paren, "("),
+          testing::Pair(lldb_private::dil::Token::identifier, "several"),
+          testing::Pair(lldb_private::dil::Token::r_paren, ")"),
+          testing::Pair(lldb_private::dil::Token::coloncolon, "::"),
+          testing::Pair(lldb_private::dil::Token::identifier, "identifiers"),
+          testing::Pair(lldb_private::dil::Token::eof, "")));
+  lexer.Advance();
+  token = lexer.GetCurrentToken();
+  EXPECT_EQ(token.GetSpelling(), "");
+  EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::eof);


Suggested change

StringRef input_expr("This string has (several ) ::identifiers");

llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer =

lldb_private::dil::DILLexer::Create(input_expr);

ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());

lldb_private::dil::DILLexer lexer(*maybe_lexer);

lldb_private::dil::Token token =

lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0);

std::vector<std::pair<lldb_private::dil::Token::Kind, std::string>>

lexer_tokens_data = ExtractTokenData(lexer);

EXPECT_THAT(

lexer_tokens_data,

testing::ElementsAre(

testing::Pair(lldb_private::dil::Token::identifier, "This"),

testing::Pair(lldb_private::dil::Token::identifier, "string"),

testing::Pair(lldb_private::dil::Token::identifier, "has"),

testing::Pair(lldb_private::dil::Token::l_paren, "("),

testing::Pair(lldb_private::dil::Token::identifier, "several"),

testing::Pair(lldb_private::dil::Token::r_paren, ")"),

testing::Pair(lldb_private::dil::Token::coloncolon, "::"),

testing::Pair(lldb_private::dil::Token::identifier, "identifiers"),

testing::Pair(lldb_private::dil::Token::eof, "")));

lexer.Advance();

token = lexer.GetCurrentToken();

EXPECT_EQ(token.GetSpelling(), "");

EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::eof);

EXPECT_THAT_EXPECTED(

ExtractTokenData("This string has (several ) ::identifiers"),

llvm::HasValue(testing::ElementsAre(

testing::Pair(lldb_private::dil::Token::identifier, "This"),

testing::Pair(lldb_private::dil::Token::identifier, "string"),

testing::Pair(lldb_private::dil::Token::identifier, "has"),

testing::Pair(lldb_private::dil::Token::l_paren, "("),

testing::Pair(lldb_private::dil::Token::identifier, "several"),

testing::Pair(lldb_private::dil::Token::r_paren, ")"),

testing::Pair(lldb_private::dil::Token::coloncolon, "::"),

testing::Pair(lldb_private::dil::Token::identifier, "identifiers"),

testing::Pair(lldb_private::dil::Token::eof, ""))));

(and have ExtractTokenData construct the lexer internally and return llvm::Expected<std::vector<...>>)

I think this will reduce the boiler plate in all subsequent tests.

labath · 2025-01-31T09:53:08Z

lldb/source/ValueObject/DILLexer.cpp

+  llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();
+  llvm::StringRef::iterator start = cur_pos;
+  bool dollar_start = false;
+
+  // Must not start with a digit.
+  if (cur_pos == expr.end() || IsDigit(*cur_pos))
+    return std::nullopt;
+
+  // First character *may* be a '$', for a register name or convenience
+  // variable.
+  if (*cur_pos == '$') {
+    dollar_start = true;
+    ++cur_pos;
+  }
+
+  // Contains only letters, digits or underscores
+  for (; cur_pos != expr.end(); ++cur_pos) {
+    char c = *cur_pos;
+    if (!IsLetter(c) && !IsDigit(c) && c != '_')
+      break;
+  }
+
+  // If first char is '$', make sure there's at least one mare char, or it's
+  // invalid.
+  if (dollar_start && (cur_pos - start <= 1)) {
+    cur_pos = start;
+    return std::nullopt;
+  }
+
+  if (cur_pos == start)
+    return std::nullopt;
+
+  llvm::StringRef word = expr.substr(start - expr.begin(), cur_pos - start);
+  if (remainder.consume_front(word))
+    return word;
+
+  return std::nullopt;


This should be equivalent to your code, but I'm only suggesting this to show how code like this can be written in a shorter and more stringref-y fashion. I think the actual algorithm will have to change, as it has a very narrow definition of identifiers.

Suggested change

llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();

llvm::StringRef::iterator start = cur_pos;

bool dollar_start = false;

// Must not start with a digit.

if (cur_pos == expr.end() || IsDigit(*cur_pos))

return std::nullopt;

// First character *may* be a '$', for a register name or convenience

// variable.

if (*cur_pos == '$') {

dollar_start = true;

++cur_pos;

}

// Contains only letters, digits or underscores

for (; cur_pos != expr.end(); ++cur_pos) {

char c = *cur_pos;

if (!IsLetter(c) && !IsDigit(c) && c != '_')

break;

}

// If first char is '$', make sure there's at least one mare char, or it's

// invalid.

if (dollar_start && (cur_pos - start <= 1)) {

cur_pos = start;

return std::nullopt;

}

if (cur_pos == start)

return std::nullopt;

llvm::StringRef word = expr.substr(start - expr.begin(), cur_pos - start);

if (remainder.consume_front(word))

return word;

return std::nullopt;

const char *start = remainder.data();

remainder.consume_front("$"); // initial '$' is valid

remainder = remainder.drop_while([](char c){ return IsDigit(c) || IsLetter(c) || c=='_'; });

llvm::StringRef candidate(start, remainder.data()-start);

if (candidate.empty() || candidate == "$" || IsDigit(candidate[0]))

return std::nullopt;

return candidate;

(Not very surprisingly) I find it easier to understand my code than your code, so I would prefer to keep my code. But if you really want me to use something like your code instead I will ...

One question: When constructing 'candidate', you've already removed all the characters that should comprise the candidate word from remainder. Since those characters are no longer there, doesn't that make 'start' invalid? (It's pointing to something that's been removed from 'remainder'?). How can 'remainder.data() - start' possibly work at this point?

The key thing to realize here is that a StringRef is just a fancy name for a pair<const char *, size_t>. On its own, it doesn't make anything valid. It's only valid as long as the string it points to is valid. And the same goes for the validity pointers you get from the StringRef. This is just a pointer to the parsed string, which will remain valid for as long as that string is around -- regardless of what happens to the StringRef.

That said, I realized a different problem with this code. It will end up modifying the remainder even in the failure case. So here's a slightly different version which avoids that. This one isn't completely equivalent, as I've dropped the requirement on the positioning of the dollar sign, but that's something I very strongly believe we should do anyway.

StringRef candidate = remainder.take_while([](char c) { return IsDigit(c) || IsLetter(c) || c=='_' || c=='$'; }); if (candidate.empty() || IsDigit(candidate[0])) return std::nullopt; remainder.drop_front(candidate.size()); return candidate;

labath · 2025-01-31T09:54:43Z

lldb/source/ValueObject/DILLexer.cpp

+                                    llvm::StringRef &remainder) {
+  // Skip over whitespace (spaces).
+  remainder = remainder.ltrim();
+  llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();


Suggested change

llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();

llvm::StringRef::iterator cur_pos = remainder.begin();

Your suggestion is not correct. "cur_pos" is supposed to refer to the position of the currently-being-lexed word/token/etc in the original input string, not its position in remainder.

But that's the same thing. Both stringrefs are just pointers to the same underlying string. StringRef::iterator is just a fancy name for const char *. Would it look better to you if this was written as const char *cur_pos = remainder.data(); ?

Or even better, since this is only used constructing the position value below (I'm ignoring the check below), you can just drop this and construct position directly as remainder.data()-expr.data()

"Both stringrefs are pointers to the same underlying string" -- I did not know that. SO...there's a string (const char *) in memory, that both StringRefs point to, and updating the StringRefs never updates the underlying memory contents? Hm...it would be nice if there was documentation somewhere that made this clear (the doxygen stuff does NOT). Thanks for the explanation & clarification.

labath · 2025-01-31T09:55:14Z

lldb/source/ValueObject/DILLexer.cpp

+  llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();
+
+  // Check to see if we've reached the end of our input string.
+  if (remainder.empty() || cur_pos == expr.end())


Suggested change

if (remainder.empty() || cur_pos == expr.end())

if (remainder.empty())

If the two conditions aren't equivalent, I'd like to know why.

Because they're checking on two different StringRefs? If this function has been called properly (with the correct arguments), then the two probablye should be equivalent. But I don't like to make assumptions like that without checking them...

When I see code like this I start to wonder whether there's an edge case I'm missing. Defensive checks may make sense on some external API which is called from a bunch of places, but here we're talking about a private function that's only called from a single place. At best that deserves an assertion (you could e.g. say assert(expr.end() == remainder.end()) at the beginning of the function.

labath · 2025-01-31T09:55:56Z

lldb/source/ValueObject/DILLexer.cpp

+  if (maybe_word) {
+    llvm::StringRef word = *maybe_word;
+    return Token(Token::identifier, word.str(), position);
+  }


Suggested change

if (maybe_word) {

llvm::StringRef word = *maybe_word;

return Token(Token::identifier, word.str(), position);

}

if (maybe_word)

return Token(Token::identifier, maybe_word->str(), position);

(it's shorter, and I don't think the extra local variable helps anything)

labath · 2025-01-31T09:57:16Z

lldb/source/ValueObject/DILLexer.cpp

+    if (remainder.consume_front(str)) {
+      cur_pos += strlen(str);
+      return Token(kind, str, position);
+    }


Suggested change

if (remainder.consume_front(str)) {

cur_pos += strlen(str);

return Token(kind, str, position);

}

if (remainder.consume_front(str))

return Token(kind, str, position);

(cur_pos isn't used after this)

jimingham · 2025-01-31T17:11:05Z

On Jan 31, 2025, at 2:21 AM, Pavel Labath ***@***.***> wrote: @labath commented on this pull request. Apart from the (mainly stylistic) inline comments, the biggest problem I see is that the definition of an identifier is still too narrow. The restriction on the dollar sign is completely unnecessary as C will let you put that anywhere <https://godbolt.org/z/o7qbfeWve>. And it doesn't allow any non-ascii characters. I really think this should be based on an deny- rather than an allow-list. Any character we don't claim for ourselves should be fair game for an identifier. If someone manages to enter the backspace character (\x7f) into the expression, then so be it. The question of "identifiers" starting with digits is interesting. Personally, I think it'd be fine to reject those (and require the currenly-vapourware quoting syntax), because I suspect you want to accept number suffixes, and I think it'd be confusing to explain why 123x is a valid identifier but 123u is not, but I suspect some might have a different opinion.

Are there any languages we know of that allow digits as the initial identifier character? Seems like that just makes parsing the language hard for no very clear benefit, so I'd be surprised if there are many that allow that. If there aren't any that we're likely to have to support, I wouldn't object to not treating tokens beginning with a number as an identifier.

…

We could continue discussing that here, or we could accept everything here, and postpone this discussion for the patch which starts parsing numbers. Up to you.. In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>: > +TEST(DILLexerTests, SimpleTest) { + StringRef input_expr("simple_var"); + uint32_t tok_len = 10; + llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer = + lldb_private::dil::DILLexer::Create(input_expr); + ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); + lldb_private::dil::DILLexer lexer(*maybe_lexer); + lldb_private::dil::Token token = + lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0); + EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::unknown); + + lexer.Advance(); + token = lexer.GetCurrentToken(); + EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier); + EXPECT_EQ(token.GetSpelling(), "simple_var"); + EXPECT_EQ(token.GetSpelling().size(), tok_len); Now the size check is unnecessary, as this is just testing the StringRef implementation In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>: > + StringRef input_expr("namespace"); + llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer = + lldb_private::dil::DILLexer::Create(input_expr); + ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); + lldb_private::dil::DILLexer lexer(*maybe_lexer); + lldb_private::dil::Token token = + lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0); + lexer.Advance(); + token = lexer.GetCurrentToken(); ⬇️ Suggested change - StringRef input_expr("namespace"); - llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer = - lldb_private::dil::DILLexer::Create(input_expr); - ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); - lldb_private::dil::DILLexer lexer(*maybe_lexer); - lldb_private::dil::Token token = - lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0); - lexer.Advance(); - token = lexer.GetCurrentToken(); + lldb_private::dil::Token token = + lldb_private::dil::Token(lldb_private::dil::Token::identifier, "ident", 0); (The reason is that this looks like a test for the token kind testing functions. You don't actually need the token to come out of the lexer for that. And we already have tests that check that producing a sequence of characters produces an "identifier" token.) In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>: > + token = lexer.GetCurrentToken(); + + // Current token is '('; check the next 4 tokens, to make + // sure they are the identifier 'anonymous', the identifier 'namespace' + // ')' and '::', in that order. + EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::l_paren); + EXPECT_EQ(lexer.LookAhead(1).GetKind(), lldb_private::dil::Token::identifier); + EXPECT_EQ(lexer.LookAhead(1).GetSpelling(), "anonymous"); + EXPECT_EQ(lexer.LookAhead(2).GetKind(), lldb_private::dil::Token::identifier); + EXPECT_EQ(lexer.LookAhead(2).GetSpelling(), "namespace"); + EXPECT_EQ(lexer.LookAhead(3).GetKind(), lldb_private::dil::Token::r_paren); + EXPECT_EQ(lexer.LookAhead(4).GetKind(), lldb_private::dil::Token::coloncolon); + + // Our current index should still be 0, as we only looked ahead; we are still + // officially on the '('. + EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)0); ⬇️ Suggested change - EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)0); + EXPECT_EQ(lexer.GetCurrentTokenIdx(), 0u); (and elsewhere) In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>: > + + // Accept the 'lookahead', so our current token is '::', which has the index + // 4 in our vector of tokens (which starts at zero). + lexer.Advance(4); + token = lexer.GetCurrentToken(); + EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::coloncolon); + EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)4); + + lexer.Advance(); + token = lexer.GetCurrentToken(); + EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier); + EXPECT_EQ(token.GetSpelling(), "some_var"); + EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)5); + // Verify we've advanced our position counter (lexing location) in the + // input 23 characters (the length of '(anonymous namespace)::'. + EXPECT_EQ(token.GetLocation(), expect_loc); you might just say strlen("(anonymous namespace)::") and skip the constant and the comment (this is a test, so it doesn't have to be super-efficient) In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>: > + StringRef input_expr("This string has (several ) ::identifiers"); + llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer = + lldb_private::dil::DILLexer::Create(input_expr); + ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); + lldb_private::dil::DILLexer lexer(*maybe_lexer); + lldb_private::dil::Token token = + lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0); + + std::vector<std::pair<lldb_private::dil::Token::Kind, std::string>> + lexer_tokens_data = ExtractTokenData(lexer); + + EXPECT_THAT( + lexer_tokens_data, + testing::ElementsAre( + testing::Pair(lldb_private::dil::Token::identifier, "This"), + testing::Pair(lldb_private::dil::Token::identifier, "string"), + testing::Pair(lldb_private::dil::Token::identifier, "has"), + testing::Pair(lldb_private::dil::Token::l_paren, "("), + testing::Pair(lldb_private::dil::Token::identifier, "several"), + testing::Pair(lldb_private::dil::Token::r_paren, ")"), + testing::Pair(lldb_private::dil::Token::coloncolon, "::"), + testing::Pair(lldb_private::dil::Token::identifier, "identifiers"), + testing::Pair(lldb_private::dil::Token::eof, ""))); + lexer.Advance(); + token = lexer.GetCurrentToken(); + EXPECT_EQ(token.GetSpelling(), ""); + EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::eof); ⬇️ Suggested change - StringRef input_expr("This string has (several ) ::identifiers"); - llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer = - lldb_private::dil::DILLexer::Create(input_expr); - ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); - lldb_private::dil::DILLexer lexer(*maybe_lexer); - lldb_private::dil::Token token = - lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0); - - std::vector<std::pair<lldb_private::dil::Token::Kind, std::string>> - lexer_tokens_data = ExtractTokenData(lexer); - - EXPECT_THAT( - lexer_tokens_data, - testing::ElementsAre( - testing::Pair(lldb_private::dil::Token::identifier, "This"), - testing::Pair(lldb_private::dil::Token::identifier, "string"), - testing::Pair(lldb_private::dil::Token::identifier, "has"), - testing::Pair(lldb_private::dil::Token::l_paren, "("), - testing::Pair(lldb_private::dil::Token::identifier, "several"), - testing::Pair(lldb_private::dil::Token::r_paren, ")"), - testing::Pair(lldb_private::dil::Token::coloncolon, "::"), - testing::Pair(lldb_private::dil::Token::identifier, "identifiers"), - testing::Pair(lldb_private::dil::Token::eof, ""))); - lexer.Advance(); - token = lexer.GetCurrentToken(); - EXPECT_EQ(token.GetSpelling(), ""); - EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::eof); + EXPECT_THAT_EXPECTED( + ExtractTokenData("This string has (several ) ::identifiers"), + llvm::HasValue(testing::ElementsAre( + testing::Pair(lldb_private::dil::Token::identifier, "This"), + testing::Pair(lldb_private::dil::Token::identifier, "string"), + testing::Pair(lldb_private::dil::Token::identifier, "has"), + testing::Pair(lldb_private::dil::Token::l_paren, "("), + testing::Pair(lldb_private::dil::Token::identifier, "several"), + testing::Pair(lldb_private::dil::Token::r_paren, ")"), + testing::Pair(lldb_private::dil::Token::coloncolon, "::"), + testing::Pair(lldb_private::dil::Token::identifier, "identifiers"), + testing::Pair(lldb_private::dil::Token::eof, "")))); (and have ExtractTokenData construct the lexer internally and return llvm::Expected<std::vector<...>>) I think this will reduce the boiler plate in all subsequent tests. In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>: > + llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer = + lldb_private::dil::DILLexer::Create(str); + ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); + lldb_private::dil::DILLexer lexer(*maybe_lexer); + lldb_private::dil::Token token = + lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0); + lexer.Advance(); + token = lexer.GetCurrentToken(); + EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier); For example, this could boil down to this ⬇️ Suggested change - llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer = - lldb_private::dil::DILLexer::Create(str); - ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); - lldb_private::dil::DILLexer lexer(*maybe_lexer); - lldb_private::dil::Token token = - lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0); - lexer.Advance(); - token = lexer.GetCurrentToken(); - EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier); + EXPECT_THAT_EXPECTED(ExtractTokenData(str), llvm::HasValue(testing::ElementsAre(testing::Pair(identifier, str)))); In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>: > + lldb_private::dil::DILLexer::Create(str); + ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); + lldb_private::dil::DILLexer lexer(*maybe_lexer); + lldb_private::dil::Token token = + lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0); + lexer.Advance(); + token = lexer.GetCurrentToken(); + EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier); + } + + // Verify that none of the invalid identifiers come out as identifier tokens. + for (auto &str : invalid_identifiers) { + SCOPED_TRACE(str); + llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer = + lldb_private::dil::DILLexer::Create(str); + if (!maybe_lexer) { Control flow in a test is usually a bad sign (among other reasons, because we don't write tests for tests). In gtest matcher language, I think what you're trying to express is EXPECT_THAT_EXPECTED(ExtractTokenData(str), testing::Not(llvm::HasValue(testing::Contains(testing::Pair(Token::identifier, testing::_)))) (i.e., parsing does not succeed and return a token of the identifier kind), though it may be better to be even more specific: instead of saying what you do not want to happen, say what you actually expect to be the result of parsing of e.g. "123". (It's true that this will result will change once you start parsing numbers, but I think churn like that is fine for a unit test and serves as evidence of the functional changes you are making) In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>: > +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/ValueObject/DILLexer.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Testing/Support/Error.h" +#include "gtest/gtest.h" +#include <string> + +using llvm::StringRef; + +std::vector<std::pair<lldb_private::dil::Token::Kind, std::string>> I'd suggest adding using namespace lldb_private::dil to avoid all of this repetition. In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>: > + testing::Pair(lldb_private::dil::Token::r_paren, ")"), + testing::Pair(lldb_private::dil::Token::coloncolon, "::"), + testing::Pair(lldb_private::dil::Token::identifier, "identifiers"), + testing::Pair(lldb_private::dil::Token::eof, ""))); + lexer.Advance(); + token = lexer.GetCurrentToken(); + EXPECT_EQ(token.GetSpelling(), ""); + EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::eof); +} + +TEST(DILLexerTests, IdentifiersTest) { + std::vector<std::string> valid_identifiers = { + "$My_name1", "$pc", "abcd", "ab cd", "_", "_a", "_a_", + "a_b", "this", "self", "a", "MyName", "namespace"}; + std::vector<std::string> invalid_identifiers = {"234", "2a", "2", + "$", "1MyName", ""}; I think a lone dollar sign should be a valid identifier, at least until we have a reason for it not to be. The cases with numbers are more interesting, and I'm going to add them to our top-level discussion. In lldb/include/lldb/ValueObject/DILLexer.h <#123521 (comment)>: > +#include <string> +#include <vector> + +namespace lldb_private::dil { + +/// Class defining the tokens generated by the DIL lexer and used by the +/// DIL parser. +class Token { +public: + enum Kind { + coloncolon, + eof, + identifier, + l_paren, + r_paren, + unknown, It looks like the only remaining use of the "unknown" token is to construct "empty" tokens in the tests. I think all of those cases could be avoided by just declaring the Token variable later in the code. In lldb/include/lldb/ValueObject/DILLexer.h <#123521 (comment)>: > + +/// Class defining the tokens generated by the DIL lexer and used by the +/// DIL parser. +class Token { +public: + enum Kind { + coloncolon, + eof, + identifier, + l_paren, + r_paren, + unknown, + }; + + Token(Kind kind, std::string spelling, uint32_t start) + : m_kind(kind), m_spelling(spelling), m_start_pos(start) {} ⬇️ Suggested change - : m_kind(kind), m_spelling(spelling), m_start_pos(start) {} + : m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {} In lldb/include/lldb/ValueObject/DILLexer.h <#123521 (comment)>: > +}; + +/// Class for doing the simple lexing required by DIL. +class DILLexer { +public: + /// Lexes all the tokens in expr and calls the private constructor + /// with the lexed tokens. + static llvm::Expected<DILLexer> Create(llvm::StringRef expr); + + /// Return the current token to be handled by the DIL parser. + const Token &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; } + + /// Advance the current token position by N. + void Advance(uint32_t N = 1) { + // UINT_MAX means uninitialized, no "current" position, so move to start. + if (m_tokens_idx == UINT_MAX) Could we just have the lexer start at token zero straight away? In lldb/include/lldb/ValueObject/DILLexer.h <#123521 (comment)>: > + if (m_tokens_idx == UINT_MAX) + m_tokens_idx = 0; + else if (m_tokens_idx + N >= m_lexed_tokens.size()) + // N is too large; advance to the end of the lexed tokens. + m_tokens_idx = m_lexed_tokens.size() - 1; + else + m_tokens_idx += N; + } + + /// Return the lexed token N positions ahead of the 'current' token + /// being handled by the DIL parser. + const Token &LookAhead(uint32_t N) { + if (m_tokens_idx + N < m_lexed_tokens.size()) + return m_lexed_tokens[m_tokens_idx + N]; + + return m_eof_token; A valid token stream should always end in and EOF token, right? If, so you can just return the last element. In lldb/include/lldb/ValueObject/DILLexer.h <#123521 (comment)>: > + /// Return the index for the 'current' token being handled by the DIL parser. + uint32_t GetCurrentTokenIdx() { return m_tokens_idx; } + + /// Set the index for the 'current' token (to be handled by the parser) + /// to a particular position. Used for either committing 'look ahead' parsing + /// or rolling back tentative parsing. + void ResetTokenIdx(uint32_t new_value) { + assert(new_value == UINT_MAX || new_value < m_lexed_tokens.size()); + m_tokens_idx = new_value; + } + + uint32_t NumLexedTokens() { return m_lexed_tokens.size(); } + +private: + DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens) + : m_expr(dil_expr), m_lexed_tokens(lexed_tokens), m_tokens_idx(UINT_MAX), ⬇️ Suggested change - : m_expr(dil_expr), m_lexed_tokens(lexed_tokens), m_tokens_idx(UINT_MAX), + : m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)), m_tokens_idx(UINT_MAX), In lldb/source/ValueObject/DILLexer.cpp <#123521 (comment)>: > + llvm::StringRef::iterator cur_pos = expr.end() - remainder.size(); + llvm::StringRef::iterator start = cur_pos; + bool dollar_start = false; + + // Must not start with a digit. + if (cur_pos == expr.end() || IsDigit(*cur_pos)) + return std::nullopt; + + // First character *may* be a '$', for a register name or convenience + // variable. + if (*cur_pos == '$') { + dollar_start = true; + ++cur_pos; + } + + // Contains only letters, digits or underscores + for (; cur_pos != expr.end(); ++cur_pos) { + char c = *cur_pos; + if (!IsLetter(c) && !IsDigit(c) && c != '_') + break; + } + + // If first char is '$', make sure there's at least one mare char, or it's + // invalid. + if (dollar_start && (cur_pos - start <= 1)) { + cur_pos = start; + return std::nullopt; + } + + if (cur_pos == start) + return std::nullopt; + + llvm::StringRef word = expr.substr(start - expr.begin(), cur_pos - start); + if (remainder.consume_front(word)) + return word; + + return std::nullopt; This should be equivalent to your code, but I'm only suggesting this to show how code like this can be written in a shorter and more stringref-y fashion. I think the actual algorithm will have to change, as it has a very narrow definition of identifiers. ⬇️ Suggested change - llvm::StringRef::iterator cur_pos = expr.end() - remainder.size(); - llvm::StringRef::iterator start = cur_pos; - bool dollar_start = false; - - // Must not start with a digit. - if (cur_pos == expr.end() || IsDigit(*cur_pos)) - return std::nullopt; - - // First character *may* be a '$', for a register name or convenience - // variable. - if (*cur_pos == '$') { - dollar_start = true; - ++cur_pos; - } - - // Contains only letters, digits or underscores - for (; cur_pos != expr.end(); ++cur_pos) { - char c = *cur_pos; - if (!IsLetter(c) && !IsDigit(c) && c != '_') - break; - } - - // If first char is '$', make sure there's at least one mare char, or it's - // invalid. - if (dollar_start && (cur_pos - start <= 1)) { - cur_pos = start; - return std::nullopt; - } - - if (cur_pos == start) - return std::nullopt; - - llvm::StringRef word = expr.substr(start - expr.begin(), cur_pos - start); - if (remainder.consume_front(word)) - return word; - - return std::nullopt; + const char *start = remainder.data(); + remainder.consume_front("$"); // initial '$' is valid + remainder = remainder.drop_while([](char c){ return IsDigit(c) || IsLetter(c) || c=='_'; }); + llvm::StringRef candidate(start, remainder.data()-start); + if (candidate.empty() || candidate == "$" || IsDigit(candidate[0])) + return std::nullopt; + return candidate; In lldb/source/ValueObject/DILLexer.cpp <#123521 (comment)>: > + llvm::StringRef remainder = expr; + do { + if (llvm::Expected<Token> t = Lex(expr, remainder)) { + tokens.push_back(std::move(*t)); + } else { + return t.takeError(); + } + } while (tokens.back().GetKind() != Token::eof); + return DILLexer(expr, std::move(tokens)); +} + +llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr, + llvm::StringRef &remainder) { + // Skip over whitespace (spaces). + remainder = remainder.ltrim(); + llvm::StringRef::iterator cur_pos = expr.end() - remainder.size(); ⬇️ Suggested change - llvm::StringRef::iterator cur_pos = expr.end() - remainder.size(); + llvm::StringRef::iterator cur_pos = remainder.begin(); In lldb/source/ValueObject/DILLexer.cpp <#123521 (comment)>: > + tokens.push_back(std::move(*t)); + } else { + return t.takeError(); + } + } while (tokens.back().GetKind() != Token::eof); + return DILLexer(expr, std::move(tokens)); +} + +llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr, + llvm::StringRef &remainder) { + // Skip over whitespace (spaces). + remainder = remainder.ltrim(); + llvm::StringRef::iterator cur_pos = expr.end() - remainder.size(); + + // Check to see if we've reached the end of our input string. + if (remainder.empty() || cur_pos == expr.end()) ⬇️ Suggested change - if (remainder.empty() || cur_pos == expr.end()) + if (remainder.empty()) If the two conditions aren't equivalent, I'd like to know why. In lldb/source/ValueObject/DILLexer.cpp <#123521 (comment)>: > + if (maybe_word) { + llvm::StringRef word = *maybe_word; + return Token(Token::identifier, word.str(), position); + } ⬇️ Suggested change - if (maybe_word) { - llvm::StringRef word = *maybe_word; - return Token(Token::identifier, word.str(), position); - } + if (maybe_word) + return Token(Token::identifier, maybe_word->str(), position); In lldb/source/ValueObject/DILLexer.cpp <#123521 (comment)>: > + if (remainder.consume_front(str)) { + cur_pos += strlen(str); + return Token(kind, str, position); + } ⬇️ Suggested change - if (remainder.consume_front(str)) { - cur_pos += strlen(str); - return Token(kind, str, position); - } + if (remainder.consume_front(str)) + return Token(kind, str, position); (cur_pos isn't used after this) — Reply to this email directly, view it on GitHub <#123521 (review)>, or unsubscribe <https://github.com/notifications/unsubscribe-auth/ADUPVW74P2KD3GANQKHXK6D2NNFDRAVCNFSM6AAAAABVO4RH2WVHI2DSMVQWIX3LMV43YUDVNRWFEZLROVSXG5CSMV3GSZLXHMZDKOBVHE4DIMBWGI>. You are receiving this because you were mentioned.

cmtice · 2025-02-03T05:49:09Z

Apart from the (mainly stylistic) inline comments, the biggest problem I see is that the definition of an identifier is still too narrow. The restriction on the dollar sign is completely unnecessary as C will let you put that anywhere. And it doesn't allow any non-ascii characters.

I really think this should be based on an deny- rather than an allow-list. Any character we don't claim for ourselves should be fair game for an identifier. If someone manages to enter the backspace character (\x7f) into the expression, then so be it.

The question of "identifiers" starting with digits is interesting. Personally, I think it'd be fine to reject those (and require the currenly-vapourware quoting syntax), because I suspect you want to accept number suffixes, and I think it'd be confusing to explain why 123x is a valid identifier but 123u is not, but I suspect some might have a different opinion.

We could continue discussing that here, or we could accept everything here, and postpone this discussion for the patch which starts parsing numbers. Up to you..

To the best of my knowledge, all the languages that we want to support have roughly the same definition of what a valid identifier is: a letter or underscore, followed by a sequence of letters, digits and underscores, where 'letters' are defined as 'a..z' and 'A..Z'. The one's I've been able to check do not allow arbitrary characters in their identifiers. So that's what I implemented (acknowledging that I currently only recognize ascii at the moment, and fully plan to add utf8 support in the future). I added the ability to recognize the '$' at the front specifically to allow DIL users to ask about registers and LLDB convenience variables, which (to the best of my knowledge) allow '$' only in the first position, and not all by itself.

I am not sure I see that benefits of expanding what DIL recognizes as a valid identifier beyond what the languages LLDB supports recognize? Am I missing something? Or (this is quite possible) have I misunderstood the definition of what's a valid identifier for some language we want to support?

Since we definitely want to support lexing/parsing of numbers, I do not think it's a good idea for DIL to also allow identifiers to start with numbers.

- Use std::move on std::string & std::vector in constructor initializers. - Remove some unnecessary code. - Update ExtractTokenData (helper function in unit tests) to set up the lexer and to the lexing inside the function; return an llvm::Expected value. - Add 'using namespace lldb_private::dil;' to unit tests; clean up tests accordingly. - Minor code cleanups in the unit tests.

werat · 2025-02-03T11:18:16Z

I went through the thread to understand the current consensus and I see various nice ideas flying around. Let me try to summarize the requirements and the current status quo.

DIL should support synthetic children, whose names can be arbitrary in general case, even a+b*c
A common synthetic name is [0], which is used for children of vectors/maps and people want to write print vec[0]
- frame variable supports this by having special support for [] "expressions"
We want DIL to be easy & convenient to use in most (simple) cases, but also to be able to support complicated cases and it doesn't have to be super convenient for those

Possible behaviour for DIL:

Make the definition of identifier in Lexer to roughly match C or similar
Introduce escaping of identifiers, e.g. with backticks
- The expression foo->`a*b.c`+1 is parsed as approx foo.GetChildWithName("a*b.c") + 1
Add special support for [] in the parser
- The expression foo.`[1]` is parsed as foo.GetChildWithName("[1]")
- The expression foo[1] tries the following:
  - foo.GetChildWithName("1")
  - foo.GetChildWithName("[1]")
  - foo.GetChildAtIndex(1)
- The expression foo["bar"] tries:
  - foo.GetChildWithName("bar")
  - foo.GetChildWithName("[bar]")
- The expression foo[<expr>] -- expr is evaluated and treated as cases above. If the result of expr is not a number or a string -- produce an error

For cases where the GetChildWithName( "1" / "[1]" ) and GetChildAtIndex(1) produce different valid results we could shows a warning/error to make it clear to the user that there's some ambiguity. IMO this would be an improvement over the current situation where print and expr simply produce different results.

This way all typical cases work as expected by users and more or less match the current behaviour of frame variable. It's still possible to access synthetic children with any names, but it requires knowledge about the escaping.

Does this sound like a reasonable approach?

labath · 2025-02-03T11:39:28Z

To the best of my knowledge, all the languages that we want to support have roughly the same definition of what a valid identifier is: a letter or underscore, followed by a sequence of letters, digits and underscores, where 'letters' are defined as 'a..z' and 'A..Z'. The one's I've been able to check do not allow arbitrary characters in their identifiers. So that's what I implemented (acknowledging that I currently only recognize ascii at the moment, and fully plan to add utf8 support in the future). I added the ability to recognize the '$' at the front specifically to allow DIL users to ask about registers and LLDB convenience variables, which (to the best of my knowledge) allow '$' only in the first position, and not all by itself.

I don't know how you were checking that, but I'm certain that's not the case. I already gave you an example of C code which contradicts all of these (note that there can be difference between what's considered a valid identifier by the specification of a language, and what an actual compiler for that language will accept). And I'm not even mentioning all of the names that can be constructed by synthetic child providers.

You say you want to add utf-8 support. How do you intend to do that? Do you want to enumerate all of the supported characters in each language? Check which language supports variable names in Klingon? Some of the rules can be really obscure. For example, Java accepts £ (\xA3) as a variable name, but not © (\xa9). I'm sure they had some reason to choose that, but I'd rather not have to find that out.

OTOH, if you just accept all of the high-bit ascii values as valid characters, then you can support utf8 with a single line of code. And you're not regressing anything because that's exactly what the current implementation does.

I don't think this list has to be set in stone. For example, frame variable currently accepts @ as a variable name. I believe you don't have any plans for that operator, so I'd just stick to that. If we can come up with some fancy use for it (maybe as an escape character?), then I'm certainly open to changing its classification.

I am not sure I see that benefits of expanding what DIL recognizes as a valid identifier beyond what the languages LLDB supports recognize?

For me the main benefits are:

simplicity of the implementation
being able to express a wide range of variable names, even for languages we don't support right now
matching status quo

That said, I would like to hear what you think are the benefits of not recognizing wider set of identifier values. And I'm not talking about names like 123foo (it sounds like there's consensus to ban those). I'm thinking more of names like $, foo$, 💩, etc.

labath · 2025-02-03T12:55:13Z

Thanks for the summary Andy.

I went through the thread to understand the current consensus and I see various nice ideas flying around. Let me try to summarize the requirements and the current status quo.

DIL should support synthetic children, whose names can be arbitrary in general case, even a+b*c

A common synthetic name is [0], which is used for children of vectors/maps and people want to write print vec[0]

frame variable supports this by having special support for [] "expressions"

We want DIL to be easy & convenient to use in most (simple) cases, but also to be able to support complicated cases and it doesn't have to be super convenient for those

SGTM

Possible behaviour for DIL:

Make the definition of identifier in Lexer to roughly match C or similar

Introduce escaping of identifiers, e.g. with backticks

The expression foo->`a*b.c`+1 is parsed as approx foo.GetChildWithName("a*b.c") + 1

I already spoke at length about identifier names. Quoting of fancy names SGTM. I don't think its relevant for this patch (lexing), but since you're also mentioning the wider syntax of the language, I want to mention that there's also another kind of disambiguation to consider. Forcing a string to be treated as an identifier is one thing. Another question is forcing an identifier to be treated as a specific kind of entity. For example, if there's a variable and a type with the same name, can we say which one we mean? Or can we say we want to see the value of a global variable foo in a specific compile unit? Or in an outer scope that's shadowed by another variable?

We don't exactly support that right now, but e.g. target variable foo will print all global variables with that names. That gets trickier with a more complicated parser, because how do you print the result of global1+global2 if there are multiple candidates for each name

* Add special support for `[]` in the parser
  
  * The expression `` foo.`[1]`  `` is parsed as `foo.GetChildWithName("[1]")`
  * The expression `foo[1]` tries the following:
    
    * `foo.GetChildWithName("1")`
    * `foo.GetChildWithName("[1]")`
    * `foo.GetChildAtIndex(1)`

I'd go with just the second option because I'd like to avoid ambiguities and fallbacks. I think that's more or less the status quo, at least for the types with child providers. We'd need some special handling for C pointers and arrays, but that's also what we already have

  * The expression `foo["bar"]` tries:
    
    * `foo.GetChildWithName("bar")`
    * `foo.GetChildWithName("[bar]")`
  * The expression `foo[<expr>]` -- `expr` is evaluated and treated as cases above. If the result of `expr` is not a number or a string -- produce an error

I think this is an interesting idea whose usefulness mainly depends on what kind of string operations we have in the language. If there's no way to construct strings, then I don't think it's very useful. You could write things like some_variable[another_variable], but I don't think that's going to be useful because for static types (e.g. C structs) you're unlikely to have a variable with a value that contains the name of a field, and for types with synthetic children the names of the children are not going to have any relation to the way it's seen by the code (map<string, string> is still going to have a child called [0]).

Overall I'd leave this out for the time being because it doesn't impact parsing or lexing, just how some (currently invalid) syntax trees can be interpreted.

For cases where the GetChildWithName( "1" / "[1]" ) and GetChildAtIndex(1) produce different valid results we could shows a warning/error to make it clear to the user that there's some ambiguity. IMO this would be an improvement over the current situation where print and expr simply produce different results.

I'm not sure how this is going to help. I assume you're referring to the map<int, int> scenario. In this case, the map object is not going to have a child called "1", even if it happens to contain a key with the value 1. (Depending on the other keys, the name of the child containing it could be "[0]", "[1]", or "[47]"). Or are you proposing to change that?

jimingham · 2025-02-03T18:02:03Z

On Feb 2, 2025, at 9:49 PM, cmtice ***@***.***> wrote: Apart from the (mainly stylistic) inline comments, the biggest problem I see is that the definition of an identifier is still too narrow. The restriction on the dollar sign is completely unnecessary as C will let you put that anywhere <https://godbolt.org/z/o7qbfeWve>. And it doesn't allow any non-ascii characters. I really think this should be based on an deny- rather than an allow-list. Any character we don't claim for ourselves should be fair game for an identifier. If someone manages to enter the backspace character (\x7f) into the expression, then so be it. The question of "identifiers" starting with digits is interesting. Personally, I think it'd be fine to reject those (and require the currenly-vapourware quoting syntax), because I suspect you want to accept number suffixes, and I think it'd be confusing to explain why 123x is a valid identifier but 123u is not, but I suspect some might have a different opinion. We could continue discussing that here, or we could accept everything here, and postpone this discussion for the patch which starts parsing numbers. Up to you.. To the best of my knowledge, all the languages that we want to support have roughly the same definition of what a valid identifier is: a letter or underscore, followed by a sequence of letters, digits and underscores, where 'letters' are defined as 'a..z' and 'A..Z'. The one's I've been able to check do not allow arbitrary characters in their identifiers. So that's what I implemented (acknowledging that I currently only recognize ascii at the moment, and fully plan to add utf8 support in the future). I added the ability to recognize the '$' at the front specifically to allow DIL users to ask about registers and LLDB convenience variables, which (to the best of my knowledge) allow '$' only in the first position, and not all by itself. I am not sure I see that benefits of expanding what DIL recognizes as a valid identifier beyond what the languages LLDB supports recognize? Am I missing something? Or (this is quite possible) have I misunderstood the definition of what's a valid identifier for some language we want to support? Since we definitely want to support lexing/parsing of numbers, I do not think it's a good idea for DIL to also allow identifiers to start with numbers.

I agree here. We definitely will need to support UTF-8 characters, all the hip new languages use that character set. But allowing initial digits makes parsing sufficiently hard I don't think it likely there will be languages we need to support that do that. Can somebody even think of a language that allows this? Jim

…

— Reply to this email directly, view it on GitHub <#123521 (comment)>, or unsubscribe <https://github.com/notifications/unsubscribe-auth/ADUPVW36H3ZJSE2MRP6OPY32N37O5AVCNFSM6AAAAABVO4RH2WVHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDMMZQGAZTGMJXGI>. You are receiving this because you were mentioned.

jimingham · 2025-02-03T18:09:40Z

To the best of my knowledge, all the languages that we want to support have roughly the same definition of what a valid identifier is: a letter or underscore, followed by a sequence of letters, digits and underscores, where 'letters' are defined as 'a..z' and 'A..Z'. The one's I've been able to check do not allow arbitrary characters in their identifiers. So that's what I implemented (acknowledging that I currently only recognize ascii at the moment, and fully plan to add utf8 support in the future). I added the ability to recognize the '$' at the front specifically to allow DIL users to ask about registers and LLDB convenience variables, which (to the best of my knowledge) allow '$' only in the first position, and not all by itself.

I don't know how you were checking that, but I'm certain that's not the case. I already gave you an example of C code which contradicts all of these (note that there can be difference between what's considered a valid identifier by the specification of a language, and what an actual compiler for that language will accept). And I'm not even mentioning all of the names that can be constructed by synthetic child providers.

You say you want to add utf-8 support. How do you intend to do that? Do you want to enumerate all of the supported characters in each language? Check which language supports variable names in Klingon? Some of the rules can be really obscure. For example, Java accepts £ (\xA3) as a variable name, but not © (\xa9). I'm sure they had some reason to choose that, but I'd rather not have to find that out.

OTOH, if you just accept all of the high-bit ascii values as valid characters, then you can support utf8 with a single line of code. And you're not regressing anything because that's exactly what the current implementation does.

I don't think this list has to be set in stone. For example, frame variable currently accepts @ as a variable name. I believe you don't have any plans for that operator, so I'd just stick to that. If we can come up with some fancy use for it (maybe as an escape character?), then I'm certainly open to changing its classification.

I am not sure I see that benefits of expanding what DIL recognizes as a valid identifier beyond what the languages LLDB supports recognize?

For me the main benefits are:

simplicity of the implementation

being able to express a wide range of variable names, even for languages we don't support right now

matching status quo

That said, I would like to hear what you think are the benefits of not recognizing wider set of identifier values. And I'm not talking about names like 123foo (it sounds like there's consensus to ban those). I'm thinking more of names like $, foo$, 💩, etc.

There's one small area that our desire to parse the widest range of identifiers comes into conflict with, namely the "persistent variable" namespace. For the C family of languages, we reserve the initial $ for this purpose, for swift (where $1 etc. were already taken) we use $R as the namespace identifier. This doesn't matter for the DIL at present, because frame var and the like don't have access to the persistent variables, but I don't think in the long run that's desirable. But it looks like to get that right generally we would have to figure out the language of the identifier which we don't want to do. I'm not sure how to finesse this (other than to keep not allowing access to persistent variables in the DIL...)

- Remove 'unknown' token type. - Remove UINT_MAX as a valid token index; always start at 0. - Update IsWord to use Pavel's more efficient StringRef implementation. - Allow '$' anywhere in an identifer string. - Adjust unit tests to handle changes mentioned above. - Clean up test of invalid identifiers (remove if-then-else; split invalid identifiers from unrecognized strings).

cmtice · 2025-02-03T20:42:00Z

I've addressed all the remaining review comments (I think). Hopefully this is ready for approval now? :-)

labath · 2025-02-04T14:35:41Z

There's one small area that our desire to parse the widest range of identifiers comes into conflict with, namely the "persistent variable" namespace. For the C family of languages, we reserve the initial $ for this purpose, for swift (where $1 etc. were already taken) we use $R as the namespace identifier. This doesn't matter for the DIL at present, because frame var and the like don't have access to the persistent variables, but I don't think in the long run that's desirable. But it looks like to get that right generally we would have to figure out the language of the identifier which we don't want to do. I'm not sure how to finesse this (other than to keep not allowing access to persistent variables in the DIL...)

Yeah, access to persistent variables is one of the first things that I'd add to the "frame var" language. It's true that this can create some conflicts and, but I think it we don't need to handle those at the lexer level. And variables with dollars in their names are currently accepted in frame var:

   2   	  int a$b = 47;
-> 3   	  return a$b;
   4   	}
(lldb) v a$b
(int) a$b = 47

One way to handle this ambiguity would be to use the (imaginary) identifier disambiguation/scoping mechanism I alluded to in one of my previous comments. We could e.g. say that identifiers that start with a dollar are reserved by lldb, and if you want to access a variable with that name, you do something like $locals.$47.

The fact that swift uses identifiers like $1 does complicate things, and I guess it means we may have to have some context-sensitive rules about when does that refer to a program variable and when an lldb internal concept.

labath · 2025-02-04T14:51:09Z

The expression foo["bar"] tries:

foo.GetChildWithName("bar")

foo.GetChildWithName("[bar]")

The expression foo[<expr>] -- expr is evaluated and treated as cases above. If the result of expr is not a number or a string -- produce an error

I think this is an interesting idea whose usefulness mainly depends on what kind of string operations we have in the language. If there's no way to construct strings, then I don't think it's very useful. You could write things like some_variable[another_variable], but I don't think that's going to be useful because for static types (e.g. C structs) you're unlikely to have a variable with a value that contains the name of a field, and for types with synthetic children the names of the children are not going to have any relation to the way it's seen by the code (map<string, string> is still going to have a child called [0]).

Overall I'd leave this out for the time being because it doesn't impact parsing or lexing, just how some (currently invalid) syntax trees can be interpreted.

Thinking about this further, I can think of one reason to allow this even without sophisticated string operations -- because this allows you to access identifiers with weird child names. This way we could have users write obj["a+b"] for accessing the child a+b of object obj -- and we wouldn't "use up" a quote character in the process. This is sort of like how you can access global variables (even those with weird names) with globals()["my_var"] in python.

I'd still try to avoid things like guessing whether the user forgot to add [] to the name or not..

labath · 2025-02-04T14:53:21Z

lldb/include/lldb/ValueObject/DILLexer.h

+#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
+#define LLDB_VALUEOBJECT_DILLEXER_H_


I believe our headers don't have the ending underscore.

Suggested change

#ifndef LLDB_VALUEOBJECT_DILLEXER_H_

#define LLDB_VALUEOBJECT_DILLEXER_H_

#ifndef LLDB_VALUEOBJECT_DILLEXER_H

#define LLDB_VALUEOBJECT_DILLEXER_H

labath · 2025-02-04T14:55:51Z

lldb/include/lldb/ValueObject/DILLexer.h

+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+#include <limits.h>


It fits in nicer with the other headers (though maybe you don't need it if you remove the UINT_MAX thing below)

Suggested change

#include <limits.h>

#include <climits>

labath · 2025-02-04T14:57:44Z

lldb/include/lldb/ValueObject/DILLexer.h

+    identifier,
+    l_paren,
+    r_paren,
+    unknown,


The unknown token is still here. Are you sure you uploaded the right version of the patch?

labath · 2025-02-04T14:58:22Z

lldb/include/lldb/ValueObject/DILLexer.h

+  /// Advance the current token position by N.
+  void Advance(uint32_t N = 1) {
+    // UINT_MAX means uninitialized, no "current" position, so move to start.
+    if (m_tokens_idx == UINT_MAX)


And I'd hope that all of the UINT_MAX business can go away if we really start at token zero.

labath · 2025-02-04T14:59:10Z

lldb/include/lldb/ValueObject/DILLexer.h

+
+  // "eof" token; to be returned by lexer when 'look ahead' fails.
+  Token m_eof_token;
+};


Suggested change

// "eof" token; to be returned by lexer when 'look ahead' fails.

Token m_eof_token;

};

};

labath

This looks good now, thanks for your patience.

labath · 2025-02-05T08:36:28Z

lldb/unittests/ValueObject/DILLexerTests.cpp

+  if (lexer.NumLexedTokens() == 0)
+    return llvm::createStringError("No lexed tokens");
+
+  lexer.ResetTokenIdx(0);


I think this is unnecessary now.

…vm#123521) This adds the basic lexer, with unittests, for the Data Inspection Language (DIL) -- see https://discourse.llvm.org/t/rfc-data-inspection-language/69893 This version of the lexer only handles local variables and namespaces, and is designed to work with llvm#120971.

cmtice requested a review from JDevlieghere as a code owner January 19, 2025 17:21

llvmbot added the lldb label Jan 19, 2025

[LLDB] Add Lexer (with tests) for DIL (Data Inspection Language)

61a2607

Update CMakeLists.txt to build DILLexer.cpp.

cmtice requested review from adrian-prantl, jimingham, labath and werat January 19, 2025 17:26

cmtice assigned asl and kuilpd Jan 19, 2025

werat reviewed Jan 19, 2025

View reviewed changes

labath reviewed Jan 20, 2025

View reviewed changes

lldb/include/lldb/ValueObject/DILLexer.h Show resolved Hide resolved

labath reviewed Jan 27, 2025

View reviewed changes

labath reviewed Jan 31, 2025

View reviewed changes

labath reviewed Feb 4, 2025

View reviewed changes

cmtice added 4 commits February 4, 2025 08:50

Remove trailing '_' from LLDB_VALUEOBJECT_DILLEXER_H

4103144

Clearing the git cache.

29ad86c

Restore .h files after clearing the cache.

41416de

Remove unneeded include file & unused m_eof_token.

f56c019

labath approved these changes Feb 5, 2025

View reviewed changes

Remove unnecessary code from DILLexerTests.cpp, as requested.

5dbf7d2

cmtice merged commit d9a7498 into llvm:main Feb 5, 2025
5 of 6 checks passed

kuilpd mentioned this pull request Feb 6, 2025

Incorporate DILLexer changes from upstream commit. access-softek/llvm-project-dil#18

Merged

	lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
	lldb_private::dil::DILLexer lexer(input_expr);

	bool Lex(DILToken &result, bool look_ahead = false);
	std::optional<DILToken> Lex(bool look_ahead = false);

		StringRef dil_input_expr(str);
		lldb_private::dil::DILLexer dil_lexer(dil_input_expr);

		Token() : m_kind(Kind::none), m_spelling(""), m_start_pos(0) {}

		void SetKind(Kind kind) { m_kind = kind; }


		llvm::Expected<bool> LexAll();

		/// Return the lexed token N+1 positions ahead of the 'current' token

	EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)0);
	EXPECT_EQ(lexer.GetCurrentTokenIdx(), 0u);

	llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();
	llvm::StringRef::iterator cur_pos = remainder.begin();

	if (remainder.empty() \|\| cur_pos == expr.end())
	if (remainder.empty())

		#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
		#define LLDB_VALUEOBJECT_DILLEXER_H_

[LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). #123521

[LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). #123521

Uh oh!

Conversation

cmtice commented Jan 19, 2025

Uh oh!

llvmbot commented Jan 19, 2025

Uh oh!

github-actions bot commented Jan 19, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

cmtice commented Jan 26, 2025

Uh oh!

labath left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

cmtice Jan 28, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

github-actions bot commented Jan 19, 2025 •

edited

Loading

cmtice Jan 28, 2025 •

edited

Loading