11//! Manage xml character escapes
22
3- use memchr;
43use std:: borrow:: Cow ;
54use std:: collections:: HashMap ;
65use std:: ops:: Range ;
76
7+ use jetscii:: bytes;
8+ use memchr;
9+ use once_cell:: sync:: Lazy ;
10+
811#[ cfg( test) ]
912use pretty_assertions:: assert_eq;
1013
14+ static XML_ESCAPE_BYTES : Lazy < jetscii:: BytesConst > = // byte set that `escape` passes to `simd_escape`
15+ Lazy :: new ( || bytes ! ( b'<' , b'>' , b'&' , b'\'' , b'"' ) ) ;
16+ static XML_PARTIAL_ESCAPE_BYTES : Lazy < jetscii:: BytesConst > = Lazy :: new ( || bytes ! ( b'<' , b'>' , b'&' ) ) ; // byte set that `partial_escape` passes to `simd_escape` (no quote characters)
17+
1118/// Error for XML escape/unescape.
1219#[ derive( Debug ) ]
1320pub enum EscapeError {
@@ -66,31 +73,17 @@ impl std::error::Error for EscapeError {}
6673/// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their
6774/// corresponding xml escaped value.
6875pub fn escape ( raw : & [ u8 ] ) -> Cow < [ u8 ] > {
69- #[ inline]
70- fn to_escape ( b : u8 ) -> bool {
71- match b {
72- b'<' | b'>' | b'\'' | b'&' | b'"' => true ,
73- _ => false ,
74- }
75- }
76-
77- _escape ( raw, to_escape)
76+ // Previous predicate-based call, kept for reference: _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'\'' | b'&' | b'"'))
77+ simd_escape ( raw, & XML_ESCAPE_BYTES )
7878}
7979
8080/// Should only be used for escaping text content. In xml text content, it is allowed
8181/// (though not recommended) to leave the quote special characters " and ' unescaped.
8282/// This function escapes a `&[u8]` and replaces xml special characters (<, >, &) with
8383/// their corresponding xml escaped value, but does not escape quote characters.
8484pub fn partial_escape ( raw : & [ u8 ] ) -> Cow < [ u8 ] > {
85- #[ inline]
86- fn to_escape ( b : u8 ) -> bool {
87- match b {
88- b'<' | b'>' | b'&' => true ,
89- _ => false ,
90- }
91- }
92-
93- _escape ( raw, to_escape)
85+ // Previous predicate-based call, kept for reference: _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
86+ simd_escape ( raw, & XML_PARTIAL_ESCAPE_BYTES )
9487}
9588
9689/// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
@@ -112,7 +105,46 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
112105 b'\'' => escaped. extend_from_slice ( b"'" ) ,
113106 b'&' => escaped. extend_from_slice ( b"&" ) ,
114107 b'"' => escaped. extend_from_slice ( b""" ) ,
115- _ => unreachable ! ( "Only '<', '>','\' , '&' and '\" ' are escaped" ) ,
108+ c @ _ => unreachable ! (
109+ "Found {} but only '<', '>', ', '&' and '\" ' are escaped" ,
110+ c as char
111+ ) ,
112+ }
113+ pos = new_pos + 1 ;
114+ }
115+
116+ if let Some ( mut escaped) = escaped {
117+ if let Some ( raw) = raw. get ( pos..) {
118+ escaped. extend_from_slice ( raw) ;
119+ }
120+ Cow :: Owned ( escaped)
121+ } else {
122+ Cow :: Borrowed ( raw)
123+ }
124+ }
125+
126+ /// Escapes a `&[u8]`, replacing every byte found by `escape_matcher` with its corresponding xml escaped value.
127+ /// The matcher must only match xml special characters (<, >, &, ', "); a match on any other byte panics.
128+ pub fn simd_escape < ' a > ( raw : & ' a [ u8 ] , escape_matcher : & jetscii:: BytesConst ) -> Cow < ' a , [ u8 ] > {
129+ let mut escaped = None ;
130+ let mut pos = 0 ;
131+ while let Some ( i) = escape_matcher. find ( & raw [ pos..] ) {
132+ if escaped. is_none ( ) {
133+ escaped = Some ( Vec :: with_capacity ( raw. len ( ) ) ) ;
134+ }
135+ let escaped = escaped. as_mut ( ) . expect ( "initialized" ) ;
136+ let new_pos = pos + i;
137+ escaped. extend_from_slice ( & raw [ pos..new_pos] ) ;
138+ match raw[ new_pos] {
139+ b'<' => escaped. extend_from_slice ( b"<" ) ,
140+ b'>' => escaped. extend_from_slice ( b">" ) ,
141+ b'\'' => escaped. extend_from_slice ( b"'" ) ,
142+ b'&' => escaped. extend_from_slice ( b"&" ) ,
143+ b'"' => escaped. extend_from_slice ( b""" ) ,
144+ c @ _ => unreachable ! (
145+ "Found {} but only '<', '>', ', '&' and '\" ' are escaped" ,
146+ c as char
147+ ) ,
116148 }
117149 pos = new_pos + 1 ;
118150 }
0 commit comments