Commit e90b85cc, authored by Jeff Epler and committed by Damien George

extmod/modure: Convert byte offsets to unicode indices when necessary.

And add a test.

Fixes issue #9202.
Signed-off-by: Jeff Epler <jepler@gmail.com>
parent 719dbbf5
...@@ -33,6 +33,10 @@ ...@@ -33,6 +33,10 @@
#include "py/objstr.h" #include "py/objstr.h"
#include "py/stackctrl.h" #include "py/stackctrl.h"
#if MICROPY_PY_BUILTINS_STR_UNICODE
#include "py/unicode.h"
#endif
#if MICROPY_PY_URE #if MICROPY_PY_URE
#define re1_5_stack_chk() MP_STACK_CHECK() #define re1_5_stack_chk() MP_STACK_CHECK()
...@@ -121,6 +125,18 @@ STATIC void match_span_helper(size_t n_args, const mp_obj_t *args, mp_obj_t span ...@@ -121,6 +125,18 @@ STATIC void match_span_helper(size_t n_args, const mp_obj_t *args, mp_obj_t span
e = self->caps[no * 2 + 1] - begin; e = self->caps[no * 2 + 1] - begin;
} }
#if MICROPY_PY_BUILTINS_STR_UNICODE
if (mp_obj_get_type(self->str) == &mp_type_str) {
const byte *begin = (const byte *)mp_obj_str_get_str(self->str);
if (s != -1) {
s = utf8_ptr_to_index(begin, begin + s);
}
if (e != -1) {
e = utf8_ptr_to_index(begin, begin + e);
}
}
#endif
span[0] = mp_obj_new_int(s); span[0] = mp_obj_new_int(s);
span[1] = mp_obj_new_int(e); span[1] = mp_obj_new_int(e);
} }
......
# test match.span() for unicode strings
# Bind the regex module to the name "re": try "ure" first, then plain "re".
# If neither import succeeds, print SKIP and end the test.
try:
    import ure as re
except ImportError:
    try:
        import re
    except ImportError:
        print("SKIP")
        raise SystemExit
# Probe for span support: merely accessing m.span raises AttributeError when
# the match object has no such attribute, in which case the test is skipped.
try:
    m = re.match(".", "a")
    m.span
except AttributeError:
    print("SKIP")
    raise SystemExit
def print_spans(match):
    """Print span(i), start(i) and end(i) of `match` for i = 0, 1, 2, ...

    A "----" separator line is printed first. The indices are walked upwards
    from 0 until one of the lookups raises IndexError, which ends the listing
    silently.
    """
    print("----")
    try:
        i = 0
        while True:
            print(match.span(i), match.start(i), match.end(i))
            i += 1
    except IndexError:
        # The first IndexError from span()/start()/end() terminates the loop.
        pass
# Subject string with one non-ASCII character (U+2764) between two digit runs.
# NOTE(review): neither [a-z] nor [0-9] can match U+2764, so this match ends at
# index 4 and every span printed lies in the ASCII prefix -- confirm that other
# cases exercise offsets located after the multi-byte character.
pattern = r"([0-9]*)(([a-z]*)([0-9]*))"
subject = "1234\u2764567"
m = re.match(pattern, subject)
print_spans(m)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment