@@ -331,6 +331,87 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> {
331331 }
332332}
333333
/// Normalize the whitespace of an attribute value, following section 3.3.3 of the XML
/// specification.
///
/// https://www.w3.org/TR/xml/#AVNormalize
///
/// * Whitespace-like bytes (`\r`, `\n`, `\t`, `' '`) are trimmed from both ends of the value
/// * Every run of whitespace-like bytes inside the value is replaced by a single `b' '`
/// * Character and entity references are NOT substituted yet (see the TODO below)
///
/// NOTE(review): trimming and collapsing are applied to every value unconditionally; the
/// specification appears to reserve those two steps for non-CDATA attribute types -- confirm
/// that this is the intended behavior for this reader.
///
/// # Returns
///
/// `Cow::Borrowed` pointing into `attr` whenever the trimmed value is already in normal form
/// (it contains no `\r`, `\n`, `\t` and no two adjacent spaces), and an empty borrowed slice
/// when `attr` is empty or consists solely of whitespace-like bytes. `Cow::Owned` is returned
/// only when at least one byte had to be replaced or dropped.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    let is_whitespace_like = |c: u8| matches!(c, b'\n' | b'\r' | b'\t' | b' ');

    // Trim all whitespace-like bytes away from the beginning and end of the attribute value.
    let begin = match attr.iter().position(|&c| !is_whitespace_like(c)) {
        Some(idx) => idx,
        // The value is empty or made entirely of whitespace-like bytes.
        None => return Cow::Borrowed(b""),
    };
    // A non-whitespace byte exists at `begin`, so the reverse search always succeeds; the
    // fallback keeps the half-open range in bounds regardless.
    let end = attr
        .iter()
        .rposition(|&c| !is_whitespace_like(c))
        .map_or(attr.len(), |idx| idx + 1);
    let trimmed_attr = &attr[begin..end];

    // The output buffer is created lazily, the first time the output diverges from the input.
    // Until then `trimmed_attr[..idx]` is, byte for byte, the normalized prefix.
    let mut normalized: Option<Vec<u8>> = None;
    // True while we are inside a run of whitespace-like bytes.
    let mut in_whitespace_run = false;

    for (idx, &ch) in trimmed_attr.iter().enumerate() {
        if is_whitespace_like(ch) {
            if in_whitespace_run {
                // Second or later byte of a run: it is dropped, so the output now differs
                // from the input and a buffer is required from here on.
                if normalized.is_none() {
                    normalized = Some(trimmed_attr[..idx].to_vec());
                }
            } else if let Some(buf) = normalized.as_mut() {
                buf.push(b' ');
            } else if ch != b' ' {
                // First byte of a run that is not a plain space: it must be rewritten.
                let mut buf = Vec::with_capacity(trimmed_attr.len());
                buf.extend_from_slice(&trimmed_attr[..idx]);
                buf.push(b' ');
                normalized = Some(buf);
            }
            // Otherwise this is a lone plain space and no buffer exists yet: the input is
            // still identical to the output, so nothing needs to be copied.
            in_whitespace_run = true;
        } else {
            if let Some(buf) = normalized.as_mut() {
                buf.push(ch);
            }
            in_whitespace_run = false;
        }
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed_attr),
    }
}
414+
334415impl < ' a > Iterator for Attributes < ' a > {
335416 type Item = Result < Attribute < ' a > > ;
336417 fn next ( & mut self ) -> Option < Self :: Item > {
@@ -355,7 +436,7 @@ impl<'a> Iterator for Attributes<'a> {
355436 ( $key: expr, $val: expr) => {
356437 Some ( Ok ( Attribute {
357438 key: & self . bytes[ $key] ,
358- value: Cow :: Borrowed ( & self . bytes[ $val] ) ,
439+ value: normalize_attribute_value ( & self . bytes[ $val] ) ,
359440 } ) )
360441 } ;
361442 }
@@ -513,4 +594,31 @@ mod tests {
513594 assert_eq ! ( & * a. value, b"ee" ) ;
514595 assert ! ( attributes. next( ) . is_none( ) ) ;
515596 }
597+
#[test]
fn attribute_value_normalization() {
    // Runs the normalizer on `raw` and compares the resulting bytes against `expected`.
    fn check(raw: &[u8], expected: &[u8]) {
        assert_eq!(normalize_attribute_value(raw).as_ref(), expected);
    }

    // An empty value stays empty.
    check(b"", b"");

    // Return, tab, and newline characters (0xD, 0x9, 0xA) are turned into a space character.
    check(b"\r foo\r bar\t baz\n delta\n ", b"foo bar baz delta");

    // Whitespace on both ends is stripped ...
    check(b" foo ", b"foo");
    // ... as is whitespace on the leading end only ...
    check(b" bar", b"bar");
    // ... and on the trailing end only.
    check(b"baz ", b"baz");

    // Sequences of spaces are replaced with a single space.
    check(b" foo bar baz ", b"foo bar baz");

    // Sequence replacement mixed with the other whitespace-like characters (\t \r \n).
    check(
        b" \t foo\t bar \r baz \n \n delta\n \t \r echo foxtrot\r ",
        b"foo bar baz delta echo foxtrot",
    );
}
516624}
0 commit comments