@@ -33,9 +33,88 @@ pub struct Attribute<'a> {
3333}
3434
3535impl < ' a > Attribute < ' a > {
36+ /// Normalize the attribute value according to xml specification section 3.3.3
3637 ///
38+ /// https://www.w3.org/TR/xml/#AVNormalize
39+ ///
40+ /// * Whitespace-like characters (\r, \n, \t, ' ') are trimmed from the ends of the value
41+ /// * Sequences of whitespace-like characters are replaced with a single whitespace character
42+ /// * Character and entity references are substituted as defined by the spec
3743 pub fn normalized_value ( & ' a self ) -> Result < Cow < ' a , [ u8 ] > , EscapeError > {
38- let normalized = normalize_attribute_value ( self . value . as_ref ( ) ) ;
44+ // TODO: character references, entity references, error handling associated with those
45+
46+ #[ derive( PartialEq ) ]
47+ enum ParseState {
48+ Space ,
49+ CDATA ,
50+ }
51+
52+ // Trim characters from the beginning and end of the attribute value - this can't fail.
53+ fn trim_value ( attr : & [ u8 ] ) -> & [ u8 ] {
54+ let first_non_space_char = attr. iter ( ) . position ( |c| !is_whitespace ( * c) ) ;
55+
56+ if first_non_space_char. is_none ( ) {
57+ // The entire value was whitespace-like characters
58+ return b"" ;
59+ }
60+
61+ let last_non_space_char = attr. iter ( ) . rposition ( |c| !is_whitespace ( * c) ) ;
62+
63+ // Trim all whitespace-like characters away from the beginning and end of the attribute value.
64+ let begin = first_non_space_char. unwrap ( ) ;
65+ let end = last_non_space_char. unwrap_or ( attr. len ( ) ) ;
66+ & attr[ begin..=end]
67+ }
68+
69+ let trimmed_attr = trim_value ( self . value . as_ref ( ) ) ;
70+
71+ // A new buffer is only created when we encounter a situation that requires it.
72+ let mut normalized: Option < Vec < u8 > > = None ;
73+ // We start on character data because all whitespace-like characters are already trimmed away.
74+ let mut current_state = ParseState :: CDATA ;
75+
76+ // Perform a single pass over the trimmed attribute value. If we encounter a character / entity reference
77+ // or whitespace-like characters that need to be substituted, copy everything processed thus far to a new
78+ // buffer and continue using this buffer.
79+ for ( idx, ch) in trimmed_attr. iter ( ) . enumerate ( ) {
80+ match ch {
81+ b'\n' | b'\r' | b'\t' | b' ' => match current_state {
82+ ParseState :: Space => match normalized {
83+ Some ( _) => continue ,
84+ None => normalized = Some ( Vec :: from ( & trimmed_attr[ ..idx] ) ) ,
85+ } ,
86+ ParseState :: CDATA => {
87+ current_state = ParseState :: Space ;
88+ match normalized. as_mut ( ) {
89+ Some ( buf) => buf. push ( b' ' ) ,
90+ None => {
91+ let mut buf = Vec :: from ( & trimmed_attr[ ..idx] ) ;
92+ buf. push ( b' ' ) ;
93+ normalized = Some ( buf) ;
94+ }
95+ }
96+ }
97+ } ,
98+ c @ _ => match current_state {
99+ ParseState :: Space => {
100+ current_state = ParseState :: CDATA ;
101+ if let Some ( normalized) = normalized. as_mut ( ) {
102+ normalized. push ( * c) ;
103+ }
104+ }
105+ ParseState :: CDATA => {
106+ if let Some ( normalized) = normalized. as_mut ( ) {
107+ normalized. push ( * c) ;
108+ }
109+ }
110+ } ,
111+ }
112+ }
113+
114+ let normalized = match normalized {
115+ Some ( normalized) => Cow :: Owned ( normalized) ,
116+ None => Cow :: Borrowed ( trimmed_attr) ,
117+ } ;
39118 let escaped = do_unescape ( & * normalized, None ) ?;
40119 Ok ( Cow :: Owned ( escaped. into_owned ( ) ) )
41120 }
@@ -190,90 +269,6 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
190269 }
191270}
192271
193- /// Normalize the attribute value according to xml specification section 3.3.3
194- ///
195- /// https://www.w3.org/TR/xml/#AVNormalize
196- ///
197- /// * Whitespace-like characters (\r, \n, \t, ' ') are trimmed from the ends of the value
198- /// * Sequences of whitespace-like characters are replaced with a single whitespace character
199- /// * Character and entity references are substituted as defined by the spec
200- fn normalize_attribute_value ( attr : & [ u8 ] ) -> Cow < [ u8 ] > {
201- // TODO: character references, entity references, error handling associated with those
202-
203- #[ derive( PartialEq ) ]
204- enum ParseState {
205- Space ,
206- CDATA ,
207- }
208-
209- // Trim characters from the beginning and end of the attribute value - this can't fail.
210- fn trim_value ( attr : & [ u8 ] ) -> & [ u8 ] {
211- let first_non_space_char = attr. iter ( ) . position ( |c| !is_whitespace ( * c) ) ;
212-
213- if first_non_space_char. is_none ( ) {
214- // The entire value was whitespace-like characters
215- return b"" ;
216- }
217-
218- let last_non_space_char = attr. iter ( ) . rposition ( |c| !is_whitespace ( * c) ) ;
219-
220- // Trim all whitespace-like characters away from the beginning and end of the attribute value.
221- let begin = first_non_space_char. unwrap ( ) ;
222- let end = last_non_space_char. unwrap_or ( attr. len ( ) ) ;
223- & attr[ begin..=end]
224- }
225-
226- let trimmed_attr = trim_value ( attr) ;
227-
228- // A new buffer is only created when we encounter a situation that requires it.
229- let mut normalized: Option < Vec < u8 > > = None ;
230- // We start on character data because all whitespace-like characters are already trimmed away.
231- let mut current_state = ParseState :: CDATA ;
232-
233- // Perform a single pass over the trimmed attribute value. If we encounter a character / entity reference
234- // or whitespace-like characters that need to be substituted, copy everything processed thus far to a new
235- // buffer and continue using this buffer.
236- for ( idx, ch) in trimmed_attr. iter ( ) . enumerate ( ) {
237- match ch {
238- b'\n' | b'\r' | b'\t' | b' ' => match current_state {
239- ParseState :: Space => match normalized {
240- Some ( _) => continue ,
241- None => normalized = Some ( Vec :: from ( & trimmed_attr[ ..idx] ) ) ,
242- } ,
243- ParseState :: CDATA => {
244- current_state = ParseState :: Space ;
245- match normalized. as_mut ( ) {
246- Some ( buf) => buf. push ( b' ' ) ,
247- None => {
248- let mut buf = Vec :: from ( & trimmed_attr[ ..idx] ) ;
249- buf. push ( b' ' ) ;
250- normalized = Some ( buf) ;
251- }
252- }
253- }
254- } ,
255- c @ _ => match current_state {
256- ParseState :: Space => {
257- current_state = ParseState :: CDATA ;
258- if let Some ( normalized) = normalized. as_mut ( ) {
259- normalized. push ( * c) ;
260- }
261- }
262- ParseState :: CDATA => {
263- if let Some ( normalized) = normalized. as_mut ( ) {
264- normalized. push ( * c) ;
265- }
266- }
267- } ,
268- }
269- }
270-
271- match normalized {
272- Some ( normalized) => Cow :: Owned ( normalized) ,
273- None => Cow :: Borrowed ( trimmed_attr) ,
274- }
275- }
276-
277272////////////////////////////////////////////////////////////////////////////////////////////////////
278273
279274/// Iterator over XML attributes.
0 commit comments