Revision: 3862 http://trac.macosforge.org/projects/ruby/changeset/3862 Author: lsansonetti@apple.com Date: 2010-03-24 23:12:31 -0700 (Wed, 24 Mar 2010) Log Message: ----------- always use ucs2 mode, added shared regexp matchers for #gsub, #scan and #split Modified Paths: -------------- MacRuby/trunk/re.cpp MacRuby/trunk/re.h MacRuby/trunk/string.c Modified: MacRuby/trunk/re.cpp =================================================================== --- MacRuby/trunk/re.cpp 2010-03-24 23:51:11 UTC (rev 3861) +++ MacRuby/trunk/re.cpp 2010-03-25 06:12:31 UTC (rev 3862) @@ -20,6 +20,8 @@ VALUE rb_cRegexp; VALUE rb_cMatch; +static VALUE rb_cRegexpMatcher; + typedef struct rb_regexp { struct RBasic basic; UnicodeString *unistr; @@ -550,33 +552,88 @@ * p lhs # undefined local variable */ -int -rb_reg_search(VALUE re, VALUE str, int pos, bool reverse) +typedef struct rb_regexp_matcher { + struct RBasic basic; + UnicodeString *unistr; + RegexMatcher *matcher; +} rb_regexp_matcher_t; + +static IMP regexp_matcher_finalize_imp_super = NULL; + +static void +regexp_matcher_finalize_imp(void *rcv, SEL sel) { - const long len = rb_str_chars_len(str); - if (pos > len || pos < 0) { - rb_backref_set(Qnil); - return -1; + rb_regexp_matcher_t *matcher = (rb_regexp_matcher_t *)rcv; + if (matcher->unistr != NULL) { + delete matcher->unistr; + matcher->unistr = NULL; } + if (matcher->matcher != NULL) { + delete matcher->matcher; + matcher->matcher = NULL; + } + if (regexp_matcher_finalize_imp_super != NULL) { + ((void(*)(void *, SEL))regexp_matcher_finalize_imp_super)(rcv, sel); + } +} +VALUE +rb_reg_matcher_new(VALUE re, VALUE str) +{ + NEWOBJ(matcher, struct rb_regexp_matcher); + OBJSETUP(matcher, rb_cRegexpMatcher, T_OBJECT); UnicodeString *unistr = str_to_unistr(str); assert(unistr != NULL); UErrorCode status = U_ZERO_ERROR; assert(RREGEXP(re)->pattern != NULL); - RegexMatcher *matcher = RREGEXP(re)->pattern->matcher(*unistr, status); + RegexMatcher *regexp_matcher = + RREGEXP(re)->pattern->matcher(*unistr, status); - if (matcher == NULL) { + if (regexp_matcher == NULL) { delete unistr; rb_raise(rb_eRegexpError, "can't create matcher: %s", u_errorName(status)); } + matcher->matcher = regexp_matcher; + matcher->unistr = unistr; + + return (VALUE)matcher; +} + +void +rb_reg_matcher_destroy(VALUE matcher) +{ + rb_regexp_matcher_t *m = (rb_regexp_matcher_t *)matcher; + if (m ->unistr != NULL) { + delete m ->unistr; + m->unistr = NULL; + } + if (m ->matcher != NULL) { + delete m ->matcher; + m->matcher = NULL; + } + xfree(m); +} + +int +rb_reg_matcher_search(VALUE re, VALUE matcher, int pos, bool reverse) +{ + rb_regexp_matcher *re_matcher = (rb_regexp_matcher *)matcher; + + if (pos > re_matcher->unistr->length() || pos < 0) { + rb_backref_set(Qnil); + return -1; + } + + UErrorCode status = U_ZERO_ERROR; + if (reverse) { const int orig = pos; while (pos >= 0) { - if (matcher->find(pos, status)) { - if (matcher->start(status) <= orig) { + if (re_matcher->matcher->find(pos, status)) { + if (re_matcher->matcher->start(status) <= orig) { break; } } @@ -584,20 +641,18 @@ } if (pos < 0) { // No match. - goto no_match; + rb_backref_set(Qnil); + return -1; } } - else if (!matcher->find(pos, status)) { + else if (!re_matcher->matcher->find(pos, status)) { // No match. -no_match: rb_backref_set(Qnil); - delete matcher; - delete unistr; return -1; } // Match found. - const int res_count = 1 + matcher->groupCount(); + const int res_count = 1 + re_matcher->matcher->groupCount(); rb_match_result_t *res = NULL; VALUE match = rb_backref_get(); @@ -628,23 +683,22 @@ } RMATCH(match)->results_count = res_count; - GC_WB(&RMATCH(match)->regexp, re); + if (RMATCH(match)->regexp != (rb_regexp_t *)re) { + GC_WB(&RMATCH(match)->regexp, re); + } rb_str_set_len(RMATCH(match)->str, 0); - rb_str_append_uchars(RMATCH(match)->str, unistr->getBuffer(), - unistr->length()); + rb_str_append_uchars(RMATCH(match)->str, re_matcher->unistr->getBuffer(), + re_matcher->unistr->length()); - res[0].beg = matcher->start(status); - res[0].end = matcher->end(status); + res[0].beg = re_matcher->matcher->start(status); + res[0].end = re_matcher->matcher->end(status); - for (int i = 0; i < matcher->groupCount(); i++) { - res[i + 1].beg = matcher->start(i + 1, status); - res[i + 1].end = matcher->end(i + 1, status); + for (int i = 0; i < re_matcher->matcher->groupCount(); i++) { + res[i + 1].beg = re_matcher->matcher->start(i + 1, status); + res[i + 1].end = re_matcher->matcher->end(i + 1, status); } - delete matcher; - delete unistr; - return res[0].beg; } @@ -1178,6 +1232,12 @@ rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(REGEXP_OPT_EXTENDED)); rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(REGEXP_OPT_MULTILINE)); + rb_cRegexpMatcher = rb_define_class("_RegexpMatcher", rb_cObject); + + regexp_matcher_finalize_imp_super = rb_objc_install_method2( + (Class)rb_cRegexpMatcher, "finalize", + (IMP)regexp_matcher_finalize_imp); + Init_Match(); } Modified: MacRuby/trunk/re.h =================================================================== --- MacRuby/trunk/re.h 2010-03-24 23:51:11 UTC (rev 3861) +++ MacRuby/trunk/re.h 2010-03-25 06:12:31 UTC (rev 3862) @@ -21,9 +21,21 @@ VALUE rb_reg_quote(VALUE pat); VALUE rb_reg_regcomp(VALUE str); -int rb_reg_search(VALUE re, VALUE str, int pos, bool reverse); VALUE rb_regexp_source(VALUE re); +VALUE rb_reg_matcher_new(VALUE re, VALUE str); +void rb_reg_matcher_destroy(VALUE matcher); +int rb_reg_matcher_search(VALUE re, VALUE matcher, int pos, bool reverse); + +static inline int +rb_reg_search(VALUE re, VALUE str, int pos, bool reverse) +{ + VALUE matcher = rb_reg_matcher_new(re, str); + const int res = rb_reg_matcher_search(re, matcher, pos, reverse); + rb_reg_matcher_destroy(matcher); + return res; +} + int rb_reg_options_to_mri(int opt); int rb_reg_options_from_mri(int mri_opt); Modified: MacRuby/trunk/string.c =================================================================== --- MacRuby/trunk/string.c 2010-03-24 23:51:11 UTC (rev 3861) +++ MacRuby/trunk/string.c 2010-03-25 06:12:31 UTC (rev 3862) @@ -1109,7 +1109,7 @@ str_include_string(rb_str_t *self, rb_str_t *searched) { return str_offset_in_bytes_for_string(self, searched, 0, - self->length_in_bytes, false) != -1; + self->length_in_bytes, true) != -1; } static rb_str_t * @@ -1144,7 +1144,7 @@ if (IS_RSTR(str)) { if (str_try_making_data_uchars(RSTR(str))) { chars = RSTR(str)->data.uchars; - chars_len = str_length(RSTR(str), false); + chars_len = str_length(RSTR(str), true); } else { //assert(BINARY_ENC(RSTR(str)->encoding)); @@ -1180,7 +1180,7 @@ return Qnil; } - const long n = str_length(RSTR(str), false); + const long n = str_length(RSTR(str), true); if (beg < 0) { beg += n; } @@ -1194,7 +1194,7 @@ len = n - beg; } - rb_str_t *substr = str_get_characters(RSTR(str), beg, beg + len - 1, false); + rb_str_t *substr = str_get_characters(RSTR(str), beg, beg + len - 1, true); return substr == NULL ? Qnil : (VALUE)substr; } @@ -1207,7 +1207,7 @@ rb_raise(rb_eIndexError, "negative length %ld", len); } - const long slen = str_length(RSTR(self), false); + const long slen = str_length(RSTR(self), true); if (slen < beg) { out_of_range: rb_raise(rb_eIndexError, "index %ld out of string", beg); @@ -1224,7 +1224,7 @@ rstr_modify(self); - str_splice(RSTR(self), beg, len, strstr, false); + str_splice(RSTR(self), beg, len, strstr, true); if (OBJ_TAINTED(strstr)) { OBJ_TAINT(self); @@ -1698,8 +1698,8 @@ default: { long beg = 0, len = 0; - switch (rb_range_beg_len(indx, &beg, &len, str_length(RSTR(str), - false), 0)) { + switch (rb_range_beg_len(indx, &beg, &len, + str_length(RSTR(str), true), 0)) { case Qfalse: break; case Qnil: @@ -1941,7 +1941,7 @@ static VALUE rstr_index(VALUE self, SEL sel, int argc, VALUE *argv) { - const long len = str_length(RSTR(self), false); + const long len = str_length(RSTR(self), true); VALUE sub, initpos; long pos; @@ -2009,7 +2009,7 @@ static VALUE rstr_rindex(VALUE self, SEL sel, int argc, VALUE *argv) { - const long len = str_length(RSTR(self), false); + const long len = str_length(RSTR(self), true); VALUE sub, initpos; long pos; @@ -2360,7 +2360,7 @@ continue; } const long pos = str_index_for_string(RSTR(str), str_need_string(tmp), - 0, rb_str_chars_len(tmp), false, false); + 0, rb_str_chars_len(tmp), false, true); if (pos == 0) { return Qtrue; } @@ -2389,7 +2389,7 @@ continue; } const long pos = str_index_for_string(RSTR(str), str_need_string(tmp), - len - sublen, len, false, false); + len - sublen, len, false, true); if (pos == len - sublen) { return Qtrue; } @@ -2730,9 +2730,11 @@ ary = rb_ary_new(); } + VALUE matcher = rb_reg_matcher_new(pat, self); + VALUE match = Qnil; long start = 0; - while (rb_reg_search(pat, self, start, false) >= 0) { + while (rb_reg_matcher_search(pat, matcher, start, false) >= 0) { match = rb_backref_get(); int count = 0; @@ -2777,6 +2779,8 @@ rb_backref_set(match); + rb_reg_matcher_destroy(matcher); + return block_given ? self : ary; } @@ -2827,7 +2831,7 @@ static VALUE rstr_split(VALUE str, SEL sel, int argc, VALUE *argv) { - const long len = str_length(RSTR(str), false); + const long len = str_length(RSTR(str), true); int lim = 0; VALUE spat, limit; @@ -2914,10 +2918,10 @@ } else { rb_str_t *spat_str = str_need_string(spat); - const long spat_len = str_length(spat_str, false); + const long spat_len = str_length(spat_str, true); do { const long pos = str_index_for_string(RSTR(str), spat_str, - beg, -1, false, false); + beg, -1, false, true); if (pos == -1) { break; } @@ -2930,9 +2934,10 @@ else { long start = beg; bool last_null = false; + VALUE matcher = rb_reg_matcher_new(spat, str); again: do { - const long pos = rb_reg_search(spat, str, start, false); + const long pos = rb_reg_matcher_search(spat, matcher, start, false); if (pos < 0) { break; } @@ -2982,6 +2987,8 @@ } } while (limit == Qnil || --lim > 1); + + rb_reg_matcher_destroy(matcher); } if (len > 0 && (!NIL_P(limit) || len > beg || lim_orig < 0)) { @@ -3181,10 +3188,10 @@ if (rs == rb_default_rs || (rslen == 1 && rb_str_get_uchar(rs, 0) == '\n')) { // Remove trailing carriage return. - UChar c = str_get_uchar(RSTR(str), len - 1, false); + UChar c = str_get_uchar(RSTR(str), len - 1, true); if (c == '\n') { to_del++; - c = len > 1 ? str_get_uchar(RSTR(str), len - 2, false) : 0; + c = len > 1 ? str_get_uchar(RSTR(str), len - 2, true) : 0; } if (c == '\r' && (rslen > 0 || to_del != 0)) { to_del++; @@ -3193,12 +3200,12 @@ else if (rslen == 0) { // Remove all trailing carriage returns. for (int i = len - 1; i >= 0; i--) { - UChar c = str_get_uchar(RSTR(str), i, false); + UChar c = str_get_uchar(RSTR(str), i, true); if (c != '\n') { break; } to_del++; - if (i > 0 && str_get_uchar(RSTR(str), i - 1, false) == '\r') { + if (i > 0 && str_get_uchar(RSTR(str), i - 1, true) == '\r') { to_del++; i--; } @@ -3207,7 +3214,7 @@ else if (rslen <= len) { // Remove trailing substring. if (str_index_for_string(RSTR(str), str_need_string(rs), - len - rslen, -1, false, false) >= 0) { + len - rslen, -1, false, true) >= 0) { to_del += rslen; } } @@ -3215,7 +3222,7 @@ if (to_del == 0) { return Qnil; } - str_delete(RSTR(str), len - to_del, to_del, false); + str_delete(RSTR(str), len - to_del, to_del, true); return str; } @@ -3260,7 +3267,7 @@ { rstr_modify(str); - const long len = str_length(RSTR(str), false); + const long len = str_length(RSTR(str), true); if (len == 0) { return Qnil; } @@ -3271,7 +3278,7 @@ to_del++; } - str_delete(RSTR(str), len - to_del, to_del, false); + str_delete(RSTR(str), len - to_del, to_del, true); return str; } @@ -3481,7 +3488,7 @@ rstr_modify(str); str_splice(RSTR(str), results[0].beg, results[0].end - results[0].beg, - str_need_string(repl), false); + str_need_string(repl), true); if (OBJ_TAINTED(repl)) { tainted = true; } @@ -3575,7 +3582,7 @@ VALUE dest = rb_str_new5(str, NULL, 0); long offset = 0, last = 0; bool changed = false; - const long len = str_length(RSTR(str), false); + const long len = str_length(RSTR(str), true); VALUE match = Qnil; if (bang) { @@ -3583,10 +3590,13 @@ rstr_modify(str); } + VALUE matcher = rb_reg_matcher_new(pat, str); + while (true) { - const long pos = rb_reg_search(pat, str, offset, false); + const long pos = rb_reg_matcher_search(pat, matcher, offset, false); if (pos < 0) { if (!changed) { + rb_reg_matcher_destroy(matcher); return bang ? Qnil : rstr_dup(str, 0); } if (last < len) { @@ -3639,6 +3649,8 @@ } } + rb_reg_matcher_destroy(matcher); + rb_backref_set(match); if (bang) { @@ -3939,7 +3951,7 @@ if (padwidth > width) { pad = RSTR(rstr_substr((VALUE)pad, 0, width)); } - str_insert(str, index, pad, false); + str_insert(str, index, pad, true); width -= padwidth; index += padwidth; } @@ -3960,12 +3972,12 @@ } rb_str_t *padstr = str_need_string(pad); - const long padwidth = str_length(RSTR(padstr), false); + const long padwidth = str_length(RSTR(padstr), true); if (padwidth == 0) { rb_raise(rb_eArgError, "zero width padding"); } - const long len = str_length(RSTR(str), false); + const long len = str_length(RSTR(str), true); long width = NUM2LONG(w); str = rb_str_new3(str); if (width < 0 || width <= len) { @@ -4051,7 +4063,7 @@ { rstr_modify(str); - long len = str_length(RSTR(str), false); + long len = str_length(RSTR(str), true); if (len == 0) { return Qnil; } @@ -4069,7 +4081,7 @@ } if (pos > 0) { - str_delete(RSTR(str), 0, pos, false); + str_delete(RSTR(str), 0, pos, true); len -= pos; changed = true; } @@ -4087,7 +4099,7 @@ } if (pos < len - 1 && pos >= 0) { - str_delete(RSTR(str), pos + 1, len - pos - 1, false); + str_delete(RSTR(str), pos + 1, len - pos - 1, true); changed = true; } } @@ -4260,13 +4272,13 @@ rs_str = str_need_string(rb_default_rs); } - const long len = str_length(RSTR(str), false); + const long len = str_length(RSTR(str), true); const bool tainted = OBJ_TAINTED(str); long pos = 0; do { const long off = str_index_for_string(RSTR(str), rs_str, pos, -1, - false, false); + false, true); long substr_len = 0; if (off < 0) { @@ -5929,7 +5941,7 @@ rb_str_get_uchar(VALUE str, long pos) { if (RSTR(str)) { - return str_get_uchar(RSTR(str), pos, false); + return str_get_uchar(RSTR(str), pos, true); } assert(pos >= 0 && pos < CFStringGetLength((CFStringRef)str)); return CFStringGetCharacterAtIndex((CFStringRef)str, pos); @@ -5965,7 +5977,7 @@ rb_str_chars_len(VALUE str) { if (IS_RSTR(str)) { - return str_length(RSTR(str), false); + return str_length(RSTR(str), true); } return CFStringGetLength((CFStringRef)str); }
participants (1)
-
source_changes@macosforge.org