Skip to content

Sentry Nodes lagging behind validator in consensus process #2430

@mazzy89

Description

@mazzy89

Sentry Nodes lagging behind validator in consensus process

Description

I have configured a network with 2 Sentry nodes and 1 validator. Beyond the default values and the values necessary to set up a node as a Sentry, here is a copy of the config.toml for a Sentry node:

    # Mechanism to connect to the ABCI application: socket | grpc
    abci = "socket"

    # Database backend: goleveldb | boltdb
    # * goleveldb (github.com/syndtr/goleveldb - most popular implementation)
    #  - pure go
    #  - stable
    # * boltdb (uses etcd's fork of bolt - go.etcd.io/bbolt)
    #  - EXPERIMENTAL
    #  - may be faster in some use-cases (random reads - indexer)
    #  - use boltdb build tag (go build -tags boltdb)
    db_backend = "goleveldb"

    # Database directory
    db_dir = "db"

    # If this node is many blocks behind the tip of the chain, FastSync
    # allows them to catchup quickly by downloading blocks in parallel
    # and verifying their commits
    fast_sync = true

    # If true, query the ABCI app on connecting to a new peer
    # so the app can decide if we should keep the connection or not
    filter_peers = false
    home = ""

    # A custom human readable name for this node
    moniker = "sen1"

    # Path to the JSON file containing the private key to use for node authentication in the p2p protocol
    node_key_file = "secrets/node_key.json"

    # Path to the JSON file containing the private key to use as a validator in the consensus protocol
    priv_validator_key_file = "secrets/priv_validator_key.json"

    # TCP or UNIX socket address for Tendermint to listen on for
    # connections from an external PrivValidator process
    priv_validator_laddr = ""

    # Path to the JSON file containing the last sign state of a validator
    priv_validator_state_file = "priv_validator_state.json"

    # TCP or UNIX socket address for the profiling server to listen on
    prof_laddr = ""

    # TCP or UNIX socket address of the ABCI application,
    # or the name of an ABCI application compiled in with the Tendermint binary
    proxy_app = "tcp://127.0.0.1:26658"

    ##### consensus configuration options #####
    [consensus]

      # EmptyBlocks mode and possible interval between empty blocks
      create_empty_blocks = true
      create_empty_blocks_interval = "0s"
      home = ""

      # Reactor sleep duration parameters
      peer_gossip_sleep_duration = "100ms"
      peer_query_maj23_sleep_duration = "2s"

      # Make progress as soon as we have all the precommits (as if TimeoutCommit = 0)
      skip_timeout_commit = false
      timeout_commit = "1s"
      timeout_precommit = "1s"
      timeout_precommit_delta = "500ms"
      timeout_prevote = "1s"
      timeout_prevote_delta = "500ms"
      timeout_propose = "3s"
      timeout_propose_delta = "500ms"
      wal_file = "wal/cs.wal/wal"

    ##### mempool configuration options #####
    [mempool]
      broadcast = true

      # Size of the cache (used to filter transactions we saw earlier) in transactions
      cache_size = 10000
      home = ""

      # Limit the total size of all txs in the mempool.
      # This only accounts for raw transactions (e.g. given 1MB transactions and
      # max_pending_txs_bytes=5MB, mempool will only accept 5 transactions).
      max_pending_txs_bytes = 1073741824
      recheck = true

      # Maximum number of transactions in the mempool
      size = 5000
      wal_dir = ""

    ##### peer to peer configuration options #####
    [p2p]

      # Toggle to disable guard against peers connecting from the same ip.
      allow_duplicate_ip = false
      dial_timeout = "3s"

      # Address to advertise to peers for them to dial
      # If empty, will use the same port as the laddr,
      # and will introspect on the listener or use UPnP
      # to figure out the address.
      external_address = ""

      # Time to wait before flushing messages out on the connection
      flush_throttle_timeout = "10ms"

      # Peer connection configuration.
      handshake_timeout = "20s"
      home = ""

      # Address to listen for incoming connections
      laddr = "tcp://gnodevx-gnoland-sen1-0:26656"

      # Maximum number of inbound peers
      max_num_inbound_peers = 40

      # Maximum number of outbound peers to connect to, excluding persistent peers
      max_num_outbound_peers = 10

      # Maximum size of a message packet payload, in bytes
      max_packet_msg_payload_size = 10240

      # Comma separated list of nodes to keep persistent connections to
      persistent_peers = "g10kvhns4t8a49vvc8uk2rrgafk653ynx8qs4h98@gnodevx-gnoland-val1-headless.gnoland:26656,g1phpp92d4a60376yr4vpfff2q4a9gh4m8yf09hr@gnodevx-gnoland-sen2-headless.gnoland:26656"

      # Set true to enable the peer-exchange reactor
      pex = true

      # Comma separated list of peer IDs to keep private (will not be gossiped to other peers)
      private_peer_ids = "g10kvhns4t8a49vvc8uk2rrgafk653ynx8qs4h98"

      # Rate at which packets can be received, in bytes/second
      recv_rate = 20000000

      # Seed mode, in which node constantly crawls the network and looks for
      # peers. If another node asks it for addresses, it responds and disconnects.
      #
      # Does not work if the peer-exchange reactor is disabled.
      seed_mode = false

      # Issue: https://github.com/gnolang/gno/issues/2308
      # Comma separated list of seed nodes to connect to
      seeds = "g1phpp92d4a60376yr4vpfff2q4a9gh4m8yf09hr@gnodevx-gnoland-sen2-headless.gnoland:26656"

      # Rate at which packets can be sent, in bytes/second
      send_rate = 20000000
      test_dial_fail = false
      test_fuzz = false

      # UPNP port forwarding
      upnp = false

      [p2p.test_fuzz_config]
        MaxDelay = "3s"
        Mode = 0
        ProbDropConn = 0.0
        ProbDropRW = 0.2
        ProbSleep = 0.0

    ##### rpc server configuration options #####
    [rpc]

      # A list of non-simple headers the client is allowed to use with cross-domain requests
      cors_allowed_headers = ["Origin", "Accept", "Content-Type", "X-Requested-With", "X-Server-Time"]

      # A list of methods the client is allowed to use with cross-domain requests
      cors_allowed_methods = ["HEAD", "GET", "POST", "OPTIONS"]

      # A list of origins a cross-domain request can be executed from
      # Default value '[]' disables cors support
      # Use '["*"]' to allow any origin
      cors_allowed_origins = ["*"]

      # TCP or UNIX socket address for the gRPC server to listen on
      # NOTE: This server only supports /broadcast_tx_commit
      grpc_laddr = ""

      # Maximum number of simultaneous connections.
      # Does not include RPC (HTTP&WebSocket) connections. See max_open_connections
      # If you want to accept a larger number than the default, make sure
      # you increase your OS limits.
      # 0 - unlimited.
      # Should be < {ulimit -Sn} - {MaxNumInboundPeers} - {MaxNumOutboundPeers} - {N of wal, db and other open files}
      # 1024 - 40 - 10 - 50 = 924 = ~900
      grpc_max_open_connections = 900
      home = ""

      # TCP or UNIX socket address for the RPC server to listen on
      laddr = "tcp://gnodevx-gnoland-sen1-0:26657"

      # Maximum size of request body, in bytes
      max_body_bytes = 1000000

      # Maximum size of request header, in bytes
      max_header_bytes = 1048576

      # Maximum number of simultaneous connections (including WebSocket).
      # Does not include gRPC connections. See grpc_max_open_connections
      # If you want to accept a larger number than the default, make sure
      # you increase your OS limits.
      # 0 - unlimited.
      # Should be < {ulimit -Sn} - {MaxNumInboundPeers} - {MaxNumOutboundPeers} - {N of wal, db and other open files}
      # 1024 - 40 - 10 - 50 = 924 = ~900
      max_open_connections = 900

      # How long to wait for a tx to be committed during /broadcast_tx_commit.
      # WARNING: Using a value larger than 10s will result in increasing the
      # global HTTP write timeout, which applies to all connections and endpoints.
      # See https://github.com/tendermint/classic/issues/3435
      timeout_broadcast_tx_commit = "10s"

      # The path to a file containing the certificate that is used to create the HTTPS server.
      # May be either an absolute path or a path relative to tendermint's config directory.
      # If the certificate is signed by a certificate authority,
      # the certFile should be the concatenation of the server's certificate, any intermediates,
      # and the CA's certificate.
      # NOTE: both tls_cert_file and tls_key_file must be present for Tendermint to create HTTPS server. Otherwise, HTTP server is run.
      tls_cert_file = ""

      # The path to a file containing the matching private key that is used to create the HTTPS server.
      # May be either an absolute path or a path relative to tendermint's config directory.
      # NOTE: both tls_cert_file and tls_key_file must be present for Tendermint to create HTTPS server. Otherwise, HTTP server is run.
      tls_key_file = ""

      # Activate unsafe RPC commands like /dial_seeds and /unsafe_flush_mempool
      unsafe = false

    ##### node telemetry #####
    [telemetry]
      enabled = true

      # the endpoint to export metrics to, like a local OpenTelemetry collector
      exporter_endpoint = "grafana-k8s-monitoring-alloy.grafana-system.svc.cluster.local:4317"
      meter_name = "gnodevx"
      service_name = "gnodevx"

    ##### event store #####
    [tx_event_store]

      # Type of event store
      event_store_type = "none"

      # Event store parameters
      [tx_event_store.event_store_params]

The Sentry nodes, which run as full nodes, lag behind the validator.

Your environment

  • v1.29.5-gke.1060000
  • 0.1.0-d2d34eb6-nightly
  • branch that causes this issue (with the commit hash)

Steps to reproduce

Spin up a network from scratch crafting a genesis.

  • Spin up 2 Sentry nodes
  • Spin up 1 Validator
  • Observe the error reported in the logs of the Sentry nodes

Expected behaviour

Sentry nodes should commit each new block right after the validator commits it following consensus.

Actual behaviour

The validator gossips the transactions and finishes consensus faster than the sentry. The sentry receives the commit before it has received the block, so it cannot finalize the height because it does not know the block being committed.

Logs

2024-06-25T09:22:49.346Z	INFO 	Commit is for a block we don't know about. Set ProposalBlock=nil	{"module": "consensus", "height": 69291, "commitRound": 0, "proposal": "", "commit": "XVJ+juEVDEzxz9rhDz9WyiHHnKGhMuwqWUrvY2ob30w="}
2024-06-25T09:22:49.346Z	INFO 	Attempt to finalize failed. We don't have the commit block.	{"module": "consensus", "height": 69291, "proposal-block": "", "commit-block": "XVJ+juEVDEzxz9rhDz9WyiHHnKGhMuwqWUrvY2ob30w="}

Proposed solution

cc @zivkovicmilos

Metadata

Metadata

Assignees

No one assigned

    Labels

    🐞 bug — Something isn't working

    Type

    No type

    Projects

    Status

    Backlog

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions