Skip to content

Commit 1b51aef

Browse files
authored
Berry now accepts 'bytes()' as precompiled patterns, added 're.compilebytes()' (arendst#23149)
1 parent 6486ba3 commit 1b51aef

File tree

5 files changed

+587
-545
lines changed

5 files changed

+587
-545
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ All notable changes to this project will be documented in this file.
1717
- ESP32 enable webcam version 2 (#18732)
1818
- ESP8266 enable FTP for >= 4MB variants (#23120)
1919
- Berry update flasher for Sonoff ZBBridge Pro (#23136)
20+
- Berry `re` now accepts `bytes()` as precompiled patterns, added `re.compilebytes()` (#23149)
2021

2122
### Fixed
2223
- Berry prevent `import` from hiding a solidified class (#23112)

lib/libesp32/berry/default/be_re_lib.c

+89-43
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,25 @@ int be_re_compile(bvm *vm) {
6464
be_raise(vm, "type_error", NULL);
6565
}
6666

67+
// Native functions be_const_func()
68+
// Berry: `re.compilebytes(pattern:string) -> instance(bytes)`
69+
int be_re_compilebytes(bvm *vm) {
70+
int32_t argc = be_top(vm); // Get the number of arguments
71+
if (argc >= 1 && be_isstring(vm, 1)) {
72+
const char * regex_str = be_tostring(vm, 1);
73+
int sz = re1_5_sizecode(regex_str);
74+
if (sz < 0) {
75+
be_raise(vm, "internal_error", "error in regex");
76+
}
77+
78+
be_pushbytes(vm, NULL, sizeof(ByteProg) + sz);
79+
ByteProg *code = (ByteProg*) be_tobytes(vm, -1, NULL);
80+
re1_5_compilecode(code, regex_str);
81+
be_return(vm);
82+
}
83+
be_raise(vm, "type_error", NULL);
84+
}
85+
6786
// pushes either a list if matched, else `nil`
6887
// return index of next offset, or -1 if not found
6988
const char *be_re_match_search_run(bvm *vm, ByteProg *code, const char *hay, bbool is_anchored, bbool size_only) {
@@ -99,9 +118,10 @@ const char *be_re_match_search_run(bvm *vm, ByteProg *code, const char *hay, bbo
99118

100119
int be_re_match_search(bvm *vm, bbool is_anchored, bbool size_only) {
101120
int32_t argc = be_top(vm); // Get the number of arguments
102-
if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
103-
const char * regex_str = be_tostring(vm, 1);
121+
if (argc >= 2 && (be_isstring(vm, 1) || be_isbytes(vm, 1)) && be_isstring(vm, 2)) {
104122
const char * hay = be_tostring(vm, 2);
123+
ByteProg *code = NULL;
124+
105125
int32_t offset = 0;
106126
if (argc >= 3 && be_isint(vm, 3)) {
107127
offset = be_toint(vm, 3);
@@ -111,49 +131,64 @@ int be_re_match_search(bvm *vm, bbool is_anchored, bbool size_only) {
111131
if (offset >= hay_len) { be_return_nil(vm); } // any match of empty string returns nil, this catches implicitly when hay_len == 0
112132
hay += offset; // shift to offset
113133

114-
int sz = re1_5_sizecode(regex_str);
115-
if (sz < 0) {
116-
be_raise(vm, "internal_error", "error in regex");
117-
}
134+
if (be_isstring(vm, 1)) {
135+
const char * regex_str = be_tostring(vm, 1);
136+
int sz = re1_5_sizecode(regex_str);
137+
if (sz < 0) {
138+
be_raise(vm, "internal_error", "error in regex");
139+
}
118140

119-
ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
120-
if (code == NULL) {
121-
be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
141+
code = be_os_malloc(sizeof(ByteProg) + sz);
142+
if (code == NULL) {
143+
be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
144+
}
145+
int ret = re1_5_compilecode(code, regex_str);
146+
if (ret != 0) {
147+
be_os_free(code);
148+
be_raise(vm, "internal_error", "error in regex");
149+
}
150+
} else {
151+
code = (ByteProg *) be_tobytes(vm, 1, NULL);
122152
}
123-
int ret = re1_5_compilecode(code, regex_str);
124-
if (ret != 0) {
153+
// do the match
154+
be_re_match_search_run(vm, code, hay, is_anchored, size_only);
155+
// cleanup
156+
if (be_isstring(vm, 1)) {
125157
be_os_free(code);
126-
be_raise(vm, "internal_error", "error in regex");
127158
}
128-
be_re_match_search_run(vm, code, hay, is_anchored, size_only);
129-
be_os_free(code);
130159
be_return(vm);
131160
}
132161
be_raise(vm, "type_error", NULL);
133162
}
134163

135164
int be_re_match_search_all(bvm *vm, bbool is_anchored) {
136165
int32_t argc = be_top(vm); // Get the number of arguments
137-
if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
138-
const char * regex_str = be_tostring(vm, 1);
166+
if (argc >= 2 && (be_isstring(vm, 1) || be_isbytes(vm, 1)) && be_isstring(vm, 2)) {
139167
const char * hay = be_tostring(vm, 2);
168+
ByteProg *code = NULL;
140169
int limit = -1;
141170
if (argc >= 3) {
142171
limit = be_toint(vm, 3);
143172
}
144-
int sz = re1_5_sizecode(regex_str);
145-
if (sz < 0) {
146-
be_raise(vm, "internal_error", "error in regex");
147-
}
148173

149-
ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
150-
if (code == NULL) {
151-
be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
152-
}
153-
int ret = re1_5_compilecode(code, regex_str);
154-
if (ret != 0) {
155-
be_os_free(code);
156-
be_raise(vm, "internal_error", "error in regex");
174+
if (be_isstring(vm, 1)) {
175+
const char * regex_str = be_tostring(vm, 1);
176+
int sz = re1_5_sizecode(regex_str);
177+
if (sz < 0) {
178+
be_raise(vm, "internal_error", "error in regex");
179+
}
180+
181+
code = be_os_malloc(sizeof(ByteProg) + sz);
182+
if (code == NULL) {
183+
be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
184+
}
185+
int ret = re1_5_compilecode(code, regex_str);
186+
if (ret != 0) {
187+
be_os_free(code);
188+
be_raise(vm, "internal_error", "error in regex");
189+
}
190+
} else {
191+
code = (ByteProg *) be_tobytes(vm, 1, NULL);
157192
}
158193

159194
be_newobject(vm, "list");
@@ -165,7 +200,10 @@ int be_re_match_search_all(bvm *vm, bbool is_anchored) {
165200
be_pop(vm, 1);
166201
}
167202
be_pop(vm, 1);
168-
be_os_free(code);
203+
// cleanup
204+
if (be_isstring(vm, 1)) {
205+
be_os_free(code);
206+
}
169207
be_return(vm);
170208
}
171209
be_raise(vm, "type_error", NULL);
@@ -329,29 +367,36 @@ int re_pattern_split(bvm *vm) {
329367
// Berry: `re.split(pattern:string, s:string [, split_limit:int]) -> list(string)`
330368
int be_re_split(bvm *vm) {
331369
int32_t argc = be_top(vm); // Get the number of arguments
332-
if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
333-
const char * regex_str = be_tostring(vm, 1);
370+
if (argc >= 2 && (be_isstring(vm, 1) || be_isbytes(vm, 1)) && be_isstring(vm, 2)) {
334371
const char * hay = be_tostring(vm, 2);
372+
ByteProg *code = NULL;
335373
int split_limit = -1;
336374
if (argc >= 3) {
337375
split_limit = be_toint(vm, 3);
338376
}
339-
int sz = re1_5_sizecode(regex_str);
340-
if (sz < 0) {
341-
be_raise(vm, "internal_error", "error in regex");
342-
}
377+
if (be_isstring(vm, 1)) {
378+
const char * regex_str = be_tostring(vm, 1);
379+
int sz = re1_5_sizecode(regex_str);
380+
if (sz < 0) {
381+
be_raise(vm, "internal_error", "error in regex");
382+
}
343383

344-
ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
345-
if (code == NULL) {
346-
be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
384+
code = be_os_malloc(sizeof(ByteProg) + sz);
385+
if (code == NULL) {
386+
be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
387+
}
388+
int ret = re1_5_compilecode(code, regex_str);
389+
if (ret != 0) {
390+
be_os_free(code);
391+
be_raise(vm, "internal_error", "error in regex");
392+
}
393+
} else {
394+
code = (ByteProg *) be_tobytes(vm, 1, NULL);
347395
}
348-
int ret = re1_5_compilecode(code, regex_str);
349-
if (ret != 0) {
396+
int ret = re_pattern_split_run(vm, code, hay, split_limit);
397+
if (be_isstring(vm, 1)) {
350398
be_os_free(code);
351-
be_raise(vm, "internal_error", "error in regex");
352399
}
353-
ret = re_pattern_split_run(vm, code, hay, split_limit);
354-
be_os_free(code);
355400
return ret;
356401
}
357402
be_raise(vm, "type_error", NULL);
@@ -363,6 +408,7 @@ int be_re_split(bvm *vm) {
363408
@const_object_info_begin
364409
module re (scope: global) {
365410
compile, func(be_re_compile)
411+
compilebytes, func(be_re_compilebytes)
366412
search, func(be_re_search)
367413
searchall, func(be_re_search_all)
368414
match, func(be_re_match)

lib/libesp32/berry/tests/re.be

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Regression tests for the `re` module (re1.5 engine).
# The same checks are run three ways: one-shot calls with a pattern string,
# pattern objects from `re.compile()`, and `bytes()` programs from `re.compilebytes()`.
import re

# patterns under test
var p_lazy = "a.*?b(z+)"
var p_tag = '<([a-zA-Z]+)>'
var p_sep = '/'

# expected results, shared by the three sections
var want_lazy = ['aaaabbbccbbzzz', 'zzz']
var want_lazy_size = [14, 'zzz']
var want_tag_first = [['<abc>', 'abc']]
var want_tag_both = [['<abc>', 'abc'], ['<xyz>', 'xyz']]
var want_split = ['foo', 'bar', '', 'baz']

# expected compiled programs for each pattern
var want_lazy_code = bytes('1B0000000F0000000100000062030260FB7E00016162030260FB01627E02017A62FC7E037E017F')
var want_tag_code = bytes('1A0000000C0000000100000062030260FB7E00013C7E020302617A415A62F87E03013E7E017F')
var want_sep_code = bytes('0C000000070000000000000062030260FB7E00012F7E017F')

#- one-shot calls: the pattern string is compiled on every call -#
assert(re.search(p_lazy, "zaaaabbbccbbzzzee") == want_lazy)
assert(re.searchall(p_tag, '<abc> yeah <xyz>') == want_tag_both)

assert(re.match(p_lazy, "aaaabbbccbbzzzee") == want_lazy)
assert(re.match2(p_lazy, "aaaabbbccbbzzzee") == want_lazy_size)
assert(re.matchall(p_tag, '<abc> yeah <xyz>') == want_tag_first)
assert(re.matchall(p_tag, '<abc><xyz>') == want_tag_both)
assert(re.split(p_sep, "foo/bar//baz") == want_split)

#- pattern objects returned by re.compile() -#
var compiled
compiled = re.compile(p_lazy)
assert(compiled.search("zaaaabbbccbbzzzee") == want_lazy)
compiled = re.compile(p_tag)
assert(compiled.searchall('<abc> yeah <xyz>') == want_tag_both)

compiled = re.compile(p_lazy)
assert(compiled.match("aaaabbbccbbzzzee") == want_lazy)
assert(compiled.match2("aaaabbbccbbzzzee") == want_lazy_size)
compiled = re.compile(p_tag)
assert(compiled.matchall('<abc> yeah <xyz>') == want_tag_first)
assert(compiled.matchall('<abc><xyz>') == want_tag_both)
compiled = re.compile(p_sep)
assert(compiled.split("foo/bar//baz") == want_split)

#- bytes() programs returned by re.compilebytes() -#
# each program is compared to its expected encoding after being used
var packed
packed = re.compilebytes(p_lazy)
assert(re.search(packed, "zaaaabbbccbbzzzee") == want_lazy)
assert(packed == want_lazy_code)

packed = re.compilebytes(p_tag)
assert(re.searchall(packed, '<abc> yeah <xyz>') == want_tag_both)
assert(packed == want_tag_code)

packed = re.compilebytes(p_lazy)
assert(re.match(packed, "aaaabbbccbbzzzee") == want_lazy)
assert(re.match2(packed, "aaaabbbccbbzzzee") == want_lazy_size)
assert(packed == want_lazy_code)

packed = re.compilebytes(p_tag)
assert(re.matchall(packed, '<abc> yeah <xyz>') == want_tag_first)
assert(re.matchall(packed, '<abc><xyz>') == want_tag_both)
assert(packed == want_tag_code)

packed = re.compilebytes(p_sep)
assert(re.split(packed, "foo/bar//baz") == want_split)
assert(packed == want_sep_code)

lib/libesp32/berry_tasmota/src/embedded/webserver_async.be

+16-19
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,10 @@ class webserver_async
238238
# pre: self.buf_in is not empty
239239
# post: self.buf_in has made progress (smaller or '')
240240
def parse_http_req_line()
241-
var m = global._re_http_srv.match2(self.buf_in, self.buf_in_offset)
241+
import re
242+
# print("parse_http_req_line", "self.buf_in=", self.buf_in)
243+
var m = re.match2(self.server.re_http_srv, self.buf_in, self.buf_in_offset)
244+
# print(f"{m=}")
242245
# Ex: "GET / HTTP/1.1\r\n"
243246
if m
244247
var offset = m[0]
@@ -261,16 +264,18 @@ class webserver_async
261264
#############################################################
262265
# parse incoming headers
263266
def parse_http_headers()
267+
import re
264268
while true
265269
# print("parse_http_headers", "self.buf_in_offset=", self.buf_in_offset)
266-
var m = global._re_http_srv_header.match2(self.buf_in, self.buf_in_offset)
267-
# print("m=", m)
270+
var m = re.match2(self.server.re_http_srv_header, self.buf_in, self.buf_in_offset)
271+
# print(f"{m=}")
268272
# Ex: [32, 'Content-Type', 'application/json']
269273
if m
270274
self.event_http_header(m[1], m[2])
271275
self.buf_in_offset += m[0]
272276
else # no more headers
273-
var m2 = global._re_http_srv_body.match2(self.buf_in, self.buf_in_offset)
277+
var m2 = re.match2(self.server.re_http_srv_body, self.buf_in, self.buf_in_offset)
278+
# print(f"{m2=}")
274279
if m2
275280
# end of headers
276281
# we keep \r\n which is used by pattern
@@ -519,9 +524,16 @@ class webserver_async
519524
var p1 # temporary object bytes() to avoid reallocation
520525

521526
# static var TIMEOUT = 1000 # default timeout: 1000ms
527+
528+
#############################################################
529+
# pre-compile REGEX
530+
#
522531
# static var HTTP_REQ = "^(\\w+) (\\S+) HTTP\\/(\\d\\.\\d)\r\n"
523532
# static var HTTP_HEADER_REGEX = "([A-Za-z0-9-]+): (.*?)\r\n" # extract a header with its 2 parts
524533
# static var HTTP_BODY_REGEX = "\r\n" # end of headers
534+
static var re_http_srv = re.compilebytes("^(\\w+) (\\S+) HTTP\\/(\\d\\.\\d)\r\n")
535+
static var re_http_srv_header = re.compilebytes("([A-Za-z0-9-]+): (.*?)\r\n")
536+
static var re_http_srv_body = re.compilebytes("\r\n")
525537

526538
#############################################################
527539
# init
@@ -535,27 +547,12 @@ class webserver_async
535547
self.cors = false
536548
self.p1 = bytes(100) # reserve 100 bytes by default
537549
# TODO what about max_clients ?
538-
self.compile_re()
539550
# register cb
540551
tasmota.add_driver(self)
541552
self.fastloop_cb = def () self.loop() end
542553
tasmota.add_fast_loop(self.fastloop_cb)
543554
end
544555

545-
#############################################################
546-
# compile once for all the regex
547-
def compile_re()
548-
import re
549-
if !global.contains("_re_http_srv")
550-
# global._re_http_srv = re.compile(self.HTTP_REQ)
551-
# global._re_http_srv_header = re.compile(self.HTTP_HEADER_REGEX)
552-
# global._re_http_srv_body = re.compile(self.HTTP_BODY_REGEX)
553-
global._re_http_srv = re.compile("^(\\w+) (\\S+) HTTP\\/(\\d\\.\\d)\r\n")
554-
global._re_http_srv_header = re.compile("([A-Za-z0-9-]+): (.*?)\r\n")
555-
global._re_http_srv_body = re.compile("\r\n")
556-
end
557-
end
558-
559556
#############################################################
560557
# enable or disable chunked mode (enabled by default)
561558
def set_chunked(chunked)

0 commit comments

Comments
 (0)