@@ -99,6 +99,48 @@ impl TlsIdentity {
9999 }
100100}
101101
/// Period of the timer that drives propagation of buffered response broadcasts to a client.
const RESPONSE_PROPAGATION_INTERVAL: Duration = Duration::from_millis(100);
103+
/// Accumulates the names of resources reported by response broadcasts so that they can be
/// propagated in batches at a set interval, rather than once for every individual change.
///
/// Names are held in a set, so a resource that is reported several times between two flushes
/// is only recorded once.
#[derive(Default, Debug)]
struct ResponseBroadcastPropagationBuffer {
    /// Resources reported as changed since the buffer was last flushed.
    changed_resources: std::sync::Mutex<std::collections::HashSet<String>>,
}
110+
111+ impl ResponseBroadcastPropagationBuffer {
112+ /// Ingest a resource broadcast message
113+ fn ingest (
114+ & self ,
115+ result : Result < & str , tokio:: sync:: broadcast:: error:: RecvError > ,
116+ ) -> Result < ( ) , tokio:: sync:: broadcast:: error:: RecvError > {
117+ result. map ( |resource| {
118+ let mut guard = match self . changed_resources . lock ( ) {
119+ Ok ( guard) => guard,
120+ Err ( poisoned) => {
121+ let guard = poisoned. into_inner ( ) ;
122+ tracing:: warn!( "recovered from poisoned mutex" ) ;
123+ guard
124+ }
125+ } ;
126+ guard. insert ( resource. into ( ) ) ;
127+ } )
128+ }
129+
130+ /// Flush all changed resources and reset the buffer
131+ fn flush ( & self ) -> Vec < String > {
132+ let mut guard = match self . changed_resources . lock ( ) {
133+ Ok ( guard) => guard,
134+ Err ( poisoned) => {
135+ let guard = poisoned. into_inner ( ) ;
136+ tracing:: warn!( "recovered from poisoned mutex" ) ;
137+ guard
138+ }
139+ } ;
140+ guard. drain ( ) . collect ( )
141+ }
142+ }
143+
102144const VERSION_INFO : & str = "9" ;
103145
104146pub struct ControlPlane < C > {
@@ -371,38 +413,45 @@ impl<C: crate::config::Configuration> ControlPlane<C> {
371413 let stream = async_stream:: try_stream! {
372414 yield response;
373415
416+ // Buffer changes so we only propagate at a set and controlled interval. This reduces
417+ // the network load when we have a high rate of change due to high cluster load.
418+ let buffer = ResponseBroadcastPropagationBuffer :: default ( ) ;
419+ let mut lag_amount: u64 = 0 ;
420+ let mut propagation_interval = tokio:: time:: interval( RESPONSE_PROPAGATION_INTERVAL ) ;
421+
374422 loop {
375423 tokio:: select! {
376- // The resource(s) have changed, inform the connected client , but only
424+ // Inform the connected client if any of the resources have changed , but only
377425 // send the changed resources that the client doesn't already have
378- res = rx. recv( ) => {
379- match res {
380- Ok ( rt) => {
381- match responder( None , rt, & mut client_tracker) {
382- Ok ( Some ( res) ) => yield res,
383- Ok ( None ) => { }
384- Err ( error) => {
385- crate :: metrics:: errors_total( KIND_SERVER , "respond" ) . inc( ) ;
386- tracing:: error!( %error, "responder failed to generate response" ) ;
387- continue ;
388- } ,
426+ _ = propagation_interval. tick( ) => {
427+ // Fetch the changed resources
428+ let mut resources = buffer. flush( ) ;
429+ // If we've been lagging on updates, collect everything instead
430+ if lag_amount > 0 {
431+ tracing:: warn!( lag_amount, "lagged while receiving response broadcasts" ) ;
432+ resources = client_tracker. tracked_resources( ) . collect( ) ;
433+ }
434+ lag_amount = 0 ;
435+ for rt in resources {
436+ match responder( None , & rt, & mut client_tracker) {
437+ Ok ( Some ( res) ) => yield res,
438+ Ok ( None ) => { } ,
439+ Err ( error) => {
440+ crate :: metrics:: errors_total( KIND_SERVER , "respond" ) . inc( ) ;
441+ tracing:: error!( %error, "responder failed to generate response" ) ;
442+ continue ;
389443 }
390444 }
445+ }
446+ }
447+ // A resource has changed, buffer it for propagation
448+ res = rx. recv( ) => {
449+ match buffer. ingest( res) {
450+ Ok ( _) => { } ,
391451 Err ( tokio:: sync:: broadcast:: error:: RecvError :: Closed ) => break ,
392- Err ( tokio:: sync:: broadcast:: error:: RecvError :: Lagged ( _) ) => {
393- let tracked_resources: Vec <_> = client_tracker. tracked_resources( ) . collect( ) ;
394- for rt in tracked_resources {
395- match responder( None , & rt, & mut client_tracker) {
396- Ok ( Some ( res) ) => yield res,
397- Ok ( None ) => { } ,
398- Err ( error) => {
399- crate :: metrics:: errors_total( KIND_SERVER , "respond" ) . inc( ) ;
400- tracing:: error!( %error, "responder failed to generate response" ) ;
401- continue ;
402- }
403- }
404- }
405- }
452+ Err ( tokio:: sync:: broadcast:: error:: RecvError :: Lagged ( amount) ) => {
453+ lag_amount += amount;
454+ } ,
406455 }
407456 }
408457 client_request = streaming. next( ) => {
@@ -712,6 +761,7 @@ impl<C: crate::config::Configuration> AggregatedControlPlaneDiscoveryService for
712761 cs
713762 } else {
714763 let Some ( cs) = client_tracker. get_state ( type_url) else {
764+ tracing:: trace!( type_url, "no client state" ) ;
715765 return Ok ( None ) ;
716766 } ;
717767
@@ -724,6 +774,7 @@ impl<C: crate::config::Configuration> AggregatedControlPlaneDiscoveryService for
724774 . map_err ( |error| tonic:: Status :: internal ( error. to_string ( ) ) ) ?;
725775
726776 if req. resources . is_empty ( ) && req. removed . is_empty ( ) {
777+ tracing:: trace!( type_url, "no resources and nothing removed" ) ;
727778 return Ok ( None ) ;
728779 }
729780
@@ -796,36 +847,45 @@ impl<C: crate::config::Configuration> AggregatedControlPlaneDiscoveryService for
796847 let stream = async_stream:: try_stream! {
797848 yield response;
798849
850+ // Buffer changes so we only propagate at a set and controlled interval. This reduces
851+ // the network load when we have a high rate of change due to high cluster load.
852+ let buffer = ResponseBroadcastPropagationBuffer :: default ( ) ;
853+ let mut lag_amount: u64 = 0 ;
854+ let mut propagation_interval = tokio:: time:: interval( RESPONSE_PROPAGATION_INTERVAL ) ;
855+
799856 loop {
800857 tokio:: select! {
801- // The resource(s) have changed, inform the connected client , but only
858+ // Inform the connected client if any of the resources have changed , but only
802859 // send the changed resources that the client doesn't already have
803- res = rx. recv( ) => {
804- match res {
805- Ok ( rt) => {
806- match responder( None , rt, & mut client_tracker) {
807- Ok ( Some ( res) ) => yield res,
808- Ok ( None ) => { }
809- Err ( error) => {
810- tracing:: error!( %error, "responder failed to generate response" ) ;
811- continue ;
812- } ,
860+ _ = propagation_interval. tick( ) => {
861+ // Fetch the changed resources
862+ let mut resources = buffer. flush( ) ;
863+ // If we've been lagging on updates, collect everything instead
864+ if lag_amount > 0 {
865+ tracing:: warn!( lag_amount, "lagged while receiving response broadcasts" ) ;
866+ resources = client_tracker. tracked_resources( ) . collect( ) ;
867+ }
868+ lag_amount = 0 ;
869+ for rt in resources {
870+ match responder( None , & rt, & mut client_tracker) {
871+ Ok ( Some ( res) ) => yield res,
872+ Ok ( None ) => { } ,
873+ Err ( error) => {
874+ crate :: metrics:: errors_total( KIND_SERVER , "respond" ) . inc( ) ;
875+ tracing:: error!( %error, "responder failed to generate response" ) ;
876+ continue ;
813877 }
814878 }
879+ }
880+ }
881+ // A resource has changed, buffer it for propagation
882+ res = rx. recv( ) => {
883+ match buffer. ingest( res) {
884+ Ok ( _) => { } ,
815885 Err ( tokio:: sync:: broadcast:: error:: RecvError :: Closed ) => break ,
816- Err ( tokio:: sync:: broadcast:: error:: RecvError :: Lagged ( _) ) => {
817- let tracked_resources: Vec <_> = client_tracker. tracked_resources( ) . collect( ) ;
818- for rt in tracked_resources {
819- match responder( None , & rt, & mut client_tracker) {
820- Ok ( Some ( res) ) => yield res,
821- Ok ( None ) => { } ,
822- Err ( error) => {
823- tracing:: error!( %error, "responder failed to generate response" ) ;
824- continue ;
825- }
826- }
827- }
828- }
886+ Err ( tokio:: sync:: broadcast:: error:: RecvError :: Lagged ( amount) ) => {
887+ lag_amount += amount;
888+ } ,
829889 }
830890 }
831891 client_request = requests. next( ) => {
@@ -839,6 +899,7 @@ impl<C: crate::config::Configuration> AggregatedControlPlaneDiscoveryService for
839899 } ;
840900
841901 if client_request. type_url == "ignore-me" {
902+ tracing:: trace!( "ignore-me received, continuing" ) ;
842903 continue ;
843904 }
844905
@@ -866,6 +927,7 @@ impl<C: crate::config::Configuration> AggregatedControlPlaneDiscoveryService for
866927 let type_url = client_request. type_url. clone( ) ;
867928
868929 let Some ( response) = responder( Some ( client_request) , & type_url, & mut client_tracker) . unwrap( ) else { continue ; } ;
930+ tracing:: trace!( kind = type_url, nonce = response. nonce, "yielding response" ) ;
869931 yield response;
870932 }
871933 _ = shutdown. changed( ) => {
0 commit comments