[macruby-changes] [3862] MacRuby/trunk

Wed Mar 24 23:12:32 PDT 2010

Revision: 3862
          http://trac.macosforge.org/projects/ruby/changeset/3862
Author:   lsansonetti at apple.com
Date:     2010-03-24 23:12:31 -0700 (Wed, 24 Mar 2010)
Log Message:
-----------
always use ucs2 mode, added shared regexp matchers for #gsub, #scan and #split

Modified Paths:
--------------
    MacRuby/trunk/re.cpp
    MacRuby/trunk/re.h
    MacRuby/trunk/string.c

Modified: MacRuby/trunk/re.cpp
===================================================================

--- MacRuby/trunk/re.cpp	2010-03-24 23:51:11 UTC (rev 3861)
+++ MacRuby/trunk/re.cpp	2010-03-25 06:12:31 UTC (rev 3862)
@@ -20,6 +20,8 @@
 VALUE rb_cRegexp;
 VALUE rb_cMatch;
 
+static VALUE rb_cRegexpMatcher;
+
 typedef struct rb_regexp {
     struct RBasic basic;
     UnicodeString *unistr;
@@ -550,33 +552,88 @@
  *     p lhs    # undefined local variable
  */
 
-int
-rb_reg_search(VALUE re, VALUE str, int pos, bool reverse)
+typedef struct rb_regexp_matcher {
+    struct RBasic basic;
+    UnicodeString *unistr;
+    RegexMatcher *matcher;
+} rb_regexp_matcher_t;
+
+static IMP regexp_matcher_finalize_imp_super = NULL; 
+
+static void
+regexp_matcher_finalize_imp(void *rcv, SEL sel)
 {
-    const long len = rb_str_chars_len(str);
-    if (pos > len || pos < 0) {
-	rb_backref_set(Qnil);
-	return -1;
+    rb_regexp_matcher_t *matcher = (rb_regexp_matcher_t *)rcv;
+    if (matcher->unistr != NULL) {
+	delete matcher->unistr;
+	matcher->unistr = NULL;
     }
+    if (matcher->matcher != NULL) {
+	delete matcher->matcher;
+	matcher->matcher = NULL;
+    }
+    if (regexp_matcher_finalize_imp_super != NULL) {
+	((void(*)(void *, SEL))regexp_matcher_finalize_imp_super)(rcv, sel);
+    }
+}
+VALUE
+rb_reg_matcher_new(VALUE re, VALUE str)
+{
+    NEWOBJ(matcher, struct rb_regexp_matcher);
+    OBJSETUP(matcher, rb_cRegexpMatcher, T_OBJECT);
 
     UnicodeString *unistr = str_to_unistr(str);
     assert(unistr != NULL);
 
     UErrorCode status = U_ZERO_ERROR;
     assert(RREGEXP(re)->pattern != NULL);
-    RegexMatcher *matcher = RREGEXP(re)->pattern->matcher(*unistr, status);
+    RegexMatcher *regexp_matcher =
+	RREGEXP(re)->pattern->matcher(*unistr, status);
 
-    if (matcher == NULL) {
+    if (regexp_matcher == NULL) {
 	delete unistr;
 	rb_raise(rb_eRegexpError, "can't create matcher: %s",
 		u_errorName(status));
     }
 
+    matcher->matcher = regexp_matcher;
+    matcher->unistr = unistr;
+
+    return (VALUE)matcher;
+}
+
+void
+rb_reg_matcher_destroy(VALUE matcher)
+{
+    rb_regexp_matcher_t *m = (rb_regexp_matcher_t *)matcher;
+    if (m ->unistr != NULL) {
+	delete m ->unistr;
+	m->unistr = NULL;
+    }
+    if (m ->matcher != NULL) {
+	delete m ->matcher;
+	m->matcher = NULL;
+    }
+    xfree(m);
+}
+
+int
+rb_reg_matcher_search(VALUE re, VALUE matcher, int pos, bool reverse)
+{
+    rb_regexp_matcher *re_matcher = (rb_regexp_matcher *)matcher;
+
+    if (pos > re_matcher->unistr->length() || pos < 0) {
+	rb_backref_set(Qnil);
+	return -1;
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+
     if (reverse) {
 	const int orig = pos;
 	while (pos >= 0) {
-	    if (matcher->find(pos, status)) {
-		if (matcher->start(status) <= orig) {
+	    if (re_matcher->matcher->find(pos, status)) {
+		if (re_matcher->matcher->start(status) <= orig) {
 		    break;
 		}
 	    }
@@ -584,20 +641,18 @@
 	}
 	if (pos < 0) {
 	    // No match.
-	    goto no_match;
+	    rb_backref_set(Qnil);
+	    return -1;
 	}
     }
-    else if (!matcher->find(pos, status)) {
+    else if (!re_matcher->matcher->find(pos, status)) {
 	// No match.
-no_match:
 	rb_backref_set(Qnil);
-	delete matcher;
-	delete unistr;
 	return -1;
     }
 
     // Match found.
-    const int res_count = 1 + matcher->groupCount();
+    const int res_count = 1 + re_matcher->matcher->groupCount();
     rb_match_result_t *res = NULL;
 
     VALUE match = rb_backref_get();
@@ -628,23 +683,22 @@
     }
 
     RMATCH(match)->results_count = res_count;
-    GC_WB(&RMATCH(match)->regexp, re);
+    if (RMATCH(match)->regexp != (rb_regexp_t *)re) {
+	GC_WB(&RMATCH(match)->regexp, re);
+    }
 
     rb_str_set_len(RMATCH(match)->str, 0);
-    rb_str_append_uchars(RMATCH(match)->str, unistr->getBuffer(),
-	    unistr->length());
+    rb_str_append_uchars(RMATCH(match)->str, re_matcher->unistr->getBuffer(),
+	    re_matcher->unistr->length());
 
-    res[0].beg = matcher->start(status);
-    res[0].end = matcher->end(status);
+    res[0].beg = re_matcher->matcher->start(status);
+    res[0].end = re_matcher->matcher->end(status);
 
-    for (int i = 0; i < matcher->groupCount(); i++) {
-	res[i + 1].beg = matcher->start(i + 1, status);
-	res[i + 1].end = matcher->end(i + 1, status);
+    for (int i = 0; i < re_matcher->matcher->groupCount(); i++) {
+	res[i + 1].beg = re_matcher->matcher->start(i + 1, status);
+	res[i + 1].end = re_matcher->matcher->end(i + 1, status);
     }
 
-    delete matcher;
-    delete unistr;
-
     return res[0].beg;
 }
 
@@ -1178,6 +1232,12 @@
     rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(REGEXP_OPT_EXTENDED));
     rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(REGEXP_OPT_MULTILINE));
 
+    rb_cRegexpMatcher = rb_define_class("_RegexpMatcher", rb_cObject);
+
+    regexp_matcher_finalize_imp_super = rb_objc_install_method2(
+	    (Class)rb_cRegexpMatcher, "finalize",
+	    (IMP)regexp_matcher_finalize_imp);
+
     Init_Match();
 }
 

Modified: MacRuby/trunk/re.h
===================================================================
--- MacRuby/trunk/re.h	2010-03-24 23:51:11 UTC (rev 3861)
+++ MacRuby/trunk/re.h	2010-03-25 06:12:31 UTC (rev 3862)
@@ -21,9 +21,21 @@
 
 VALUE rb_reg_quote(VALUE pat);
 VALUE rb_reg_regcomp(VALUE str);
-int rb_reg_search(VALUE re, VALUE str, int pos, bool reverse);
 VALUE rb_regexp_source(VALUE re);
 
+VALUE rb_reg_matcher_new(VALUE re, VALUE str);
+void rb_reg_matcher_destroy(VALUE matcher);
+int rb_reg_matcher_search(VALUE re, VALUE matcher, int pos, bool reverse);
+
+static inline int
+rb_reg_search(VALUE re, VALUE str, int pos, bool reverse)
+{
+    VALUE matcher = rb_reg_matcher_new(re, str);
+    const int res = rb_reg_matcher_search(re, matcher, pos, reverse);
+    rb_reg_matcher_destroy(matcher);
+    return res; 
+}
+
 int rb_reg_options_to_mri(int opt);
 int rb_reg_options_from_mri(int mri_opt);
 

Modified: MacRuby/trunk/string.c
===================================================================
--- MacRuby/trunk/string.c	2010-03-24 23:51:11 UTC (rev 3861)
+++ MacRuby/trunk/string.c	2010-03-25 06:12:31 UTC (rev 3862)
@@ -1109,7 +1109,7 @@
 str_include_string(rb_str_t *self, rb_str_t *searched)
 {
     return str_offset_in_bytes_for_string(self, searched, 0,
-	    self->length_in_bytes, false) != -1;
+	    self->length_in_bytes, true) != -1;
 }
 
 static rb_str_t *
@@ -1144,7 +1144,7 @@
     if (IS_RSTR(str)) {
 	if (str_try_making_data_uchars(RSTR(str))) {
 	    chars = RSTR(str)->data.uchars;
-	    chars_len = str_length(RSTR(str), false);
+	    chars_len = str_length(RSTR(str), true);
 	}
 	else {
 	    //assert(BINARY_ENC(RSTR(str)->encoding));
@@ -1180,7 +1180,7 @@
 	return Qnil;
     }
 
-    const long n = str_length(RSTR(str), false);
+    const long n = str_length(RSTR(str), true);
     if (beg < 0) {
 	beg += n;
     }
@@ -1194,7 +1194,7 @@
 	len = n - beg;
     }
 
-    rb_str_t *substr = str_get_characters(RSTR(str), beg, beg + len - 1, false);
+    rb_str_t *substr = str_get_characters(RSTR(str), beg, beg + len - 1, true);
     return substr == NULL ? Qnil : (VALUE)substr;
 }
 
@@ -1207,7 +1207,7 @@
 	rb_raise(rb_eIndexError, "negative length %ld", len);
     }
 
-    const long slen = str_length(RSTR(self), false);
+    const long slen = str_length(RSTR(self), true);
     if (slen < beg) {
 out_of_range:
 	rb_raise(rb_eIndexError, "index %ld out of string", beg);
@@ -1224,7 +1224,7 @@
 
     rstr_modify(self);
 
-    str_splice(RSTR(self), beg, len, strstr, false);
+    str_splice(RSTR(self), beg, len, strstr, true);
 
     if (OBJ_TAINTED(strstr)) {
 	OBJ_TAINT(self);
@@ -1698,8 +1698,8 @@
 	default:
 	    {
 		long beg = 0, len = 0;
-		switch (rb_range_beg_len(indx, &beg, &len, str_length(RSTR(str),
-				false), 0)) {
+		switch (rb_range_beg_len(indx, &beg, &len,
+			    str_length(RSTR(str), true), 0)) {
 		    case Qfalse:
 			break;
 		    case Qnil:
@@ -1941,7 +1941,7 @@
 static VALUE
 rstr_index(VALUE self, SEL sel, int argc, VALUE *argv)
 {
-    const long len = str_length(RSTR(self), false);
+    const long len = str_length(RSTR(self), true);
     VALUE sub, initpos;
     long pos;
 
@@ -2009,7 +2009,7 @@
 static VALUE
 rstr_rindex(VALUE self, SEL sel, int argc, VALUE *argv)
 {
-    const long len = str_length(RSTR(self), false);
+    const long len = str_length(RSTR(self), true);
     VALUE sub, initpos;
     long pos;
 
@@ -2360,7 +2360,7 @@
 	    continue;
 	}
 	const long pos = str_index_for_string(RSTR(str), str_need_string(tmp),
-		0, rb_str_chars_len(tmp), false, false);
+		0, rb_str_chars_len(tmp), false, true);
 	if (pos == 0) {
 	    return Qtrue;
 	}
@@ -2389,7 +2389,7 @@
 	    continue;
 	}
 	const long pos = str_index_for_string(RSTR(str), str_need_string(tmp),
-		len - sublen, len, false, false);
+		len - sublen, len, false, true);
 	if (pos == len - sublen) {
 	    return Qtrue;
 	}
@@ -2730,9 +2730,11 @@
 	ary = rb_ary_new();
     }
 
+    VALUE matcher = rb_reg_matcher_new(pat, self);
+
     VALUE match = Qnil;
     long start = 0;
-    while (rb_reg_search(pat, self, start, false) >= 0) {
+    while (rb_reg_matcher_search(pat, matcher, start, false) >= 0) {
 	match = rb_backref_get();
 
 	int count = 0;
@@ -2777,6 +2779,8 @@
 
     rb_backref_set(match);
 
+    rb_reg_matcher_destroy(matcher);
+
     return block_given ? self : ary;
 }
 
@@ -2827,7 +2831,7 @@
 static VALUE
 rstr_split(VALUE str, SEL sel, int argc, VALUE *argv)
 {
-    const long len = str_length(RSTR(str), false);
+    const long len = str_length(RSTR(str), true);
     int lim = 0;
 
     VALUE spat, limit;
@@ -2914,10 +2918,10 @@
 	}
 	else {
 	    rb_str_t *spat_str = str_need_string(spat);
-	    const long spat_len = str_length(spat_str, false);
+	    const long spat_len = str_length(spat_str, true);
 	    do {
 		const long pos = str_index_for_string(RSTR(str), spat_str,
-			beg, -1, false, false);
+			beg, -1, false, true);
 		if (pos == -1) {
 		    break;
 		}
@@ -2930,9 +2934,10 @@
     else {
 	long start = beg;
 	bool last_null = false;
+	VALUE matcher = rb_reg_matcher_new(spat, str);
 again:
 	do {
-	    const long pos = rb_reg_search(spat, str, start, false);
+	    const long pos = rb_reg_matcher_search(spat, matcher, start, false);
 	    if (pos < 0) {
 		break;
 	    }
@@ -2982,6 +2987,8 @@
 	    }
 	}
 	while (limit == Qnil || --lim > 1);
+
+	rb_reg_matcher_destroy(matcher);
     }
 
     if (len > 0 && (!NIL_P(limit) || len > beg || lim_orig < 0)) {
@@ -3181,10 +3188,10 @@
     if (rs == rb_default_rs
 	|| (rslen == 1 && rb_str_get_uchar(rs, 0) == '\n')) {
 	// Remove trailing carriage return.
-	UChar c = str_get_uchar(RSTR(str), len - 1, false);
+	UChar c = str_get_uchar(RSTR(str), len - 1, true);
 	if (c == '\n') {
 	    to_del++;
-	    c = len > 1 ? str_get_uchar(RSTR(str), len - 2, false) : 0;
+	    c = len > 1 ? str_get_uchar(RSTR(str), len - 2, true) : 0;
 	}
 	if (c == '\r' && (rslen > 0 || to_del != 0)) {
 	    to_del++;
@@ -3193,12 +3200,12 @@
     else if (rslen == 0) {
 	// Remove all trailing carriage returns.
 	for (int i = len - 1; i >= 0; i--) {
-	    UChar c = str_get_uchar(RSTR(str), i, false);
+	    UChar c = str_get_uchar(RSTR(str), i, true);
 	    if (c != '\n') {
 		break;
 	    }
 	    to_del++;
-	    if (i > 0 && str_get_uchar(RSTR(str), i - 1, false) == '\r') {
+	    if (i > 0 && str_get_uchar(RSTR(str), i - 1, true) == '\r') {
 		to_del++;
 		i--;
 	    }
@@ -3207,7 +3214,7 @@
     else if (rslen <= len) {
 	// Remove trailing substring.
 	if (str_index_for_string(RSTR(str), str_need_string(rs),
-		    len - rslen, -1, false, false) >= 0) {
+		    len - rslen, -1, false, true) >= 0) {
 	    to_del += rslen;
 	}
     }
@@ -3215,7 +3222,7 @@
     if (to_del == 0) {
 	return Qnil;
     }
-    str_delete(RSTR(str), len - to_del, to_del, false);
+    str_delete(RSTR(str), len - to_del, to_del, true);
     return str;
 }
 
@@ -3260,7 +3267,7 @@
 {
     rstr_modify(str);
 
-    const long len = str_length(RSTR(str), false);
+    const long len = str_length(RSTR(str), true);
     if (len == 0) {
 	return Qnil;
     }
@@ -3271,7 +3278,7 @@
 	to_del++;
     }
 
-    str_delete(RSTR(str), len - to_del, to_del, false);
+    str_delete(RSTR(str), len - to_del, to_del, true);
     return str;
 }
 
@@ -3481,7 +3488,7 @@
 
 	rstr_modify(str);
 	str_splice(RSTR(str), results[0].beg, results[0].end - results[0].beg,
-		str_need_string(repl), false);
+		str_need_string(repl), true);
 	if (OBJ_TAINTED(repl)) {
 	    tainted = true;
 	}
@@ -3575,7 +3582,7 @@
     VALUE dest = rb_str_new5(str, NULL, 0);
     long offset = 0, last = 0;
     bool changed = false;
-    const long len = str_length(RSTR(str), false);
+    const long len = str_length(RSTR(str), true);
     VALUE match = Qnil;
 
     if (bang) {
@@ -3583,10 +3590,13 @@
 	rstr_modify(str);
     }
 
+    VALUE matcher = rb_reg_matcher_new(pat, str);
+
     while (true) {
-        const long pos = rb_reg_search(pat, str, offset, false);
+        const long pos = rb_reg_matcher_search(pat, matcher, offset, false);
 	if (pos < 0) {
 	    if (!changed) {
+		rb_reg_matcher_destroy(matcher);
 		return bang ? Qnil : rstr_dup(str, 0);
 	    }
 	    if (last < len) {
@@ -3639,6 +3649,8 @@
 	}
     }
 
+    rb_reg_matcher_destroy(matcher);
+
     rb_backref_set(match);
 
     if (bang) {
@@ -3939,7 +3951,7 @@
 	if (padwidth > width) {
 	    pad = RSTR(rstr_substr((VALUE)pad, 0, width));
 	}
-	str_insert(str, index, pad, false);
+	str_insert(str, index, pad, true);
 	width -= padwidth;
 	index += padwidth;
     }
@@ -3960,12 +3972,12 @@
     }
 
     rb_str_t *padstr = str_need_string(pad);
-    const long padwidth = str_length(RSTR(padstr), false);
+    const long padwidth = str_length(RSTR(padstr), true);
     if (padwidth == 0) {
 	rb_raise(rb_eArgError, "zero width padding");
     }
 
-    const long len = str_length(RSTR(str), false);
+    const long len = str_length(RSTR(str), true);
     long width = NUM2LONG(w);
     str = rb_str_new3(str);
     if (width < 0 || width <= len) {
@@ -4051,7 +4063,7 @@
 {
     rstr_modify(str);
 
-    long len = str_length(RSTR(str), false);
+    long len = str_length(RSTR(str), true);
     if (len == 0) {
 	return Qnil;
     }
@@ -4069,7 +4081,7 @@
 	}
 
 	if (pos > 0) {
-	    str_delete(RSTR(str), 0, pos, false);
+	    str_delete(RSTR(str), 0, pos, true);
 	    len -= pos;
 	    changed = true;
 	}
@@ -4087,7 +4099,7 @@
 	}
 
 	if (pos < len - 1 && pos >= 0) {
-	    str_delete(RSTR(str), pos + 1, len - pos - 1, false);
+	    str_delete(RSTR(str), pos + 1, len - pos - 1, true);
 	    changed = true;
 	}
     }
@@ -4260,13 +4272,13 @@
 	rs_str = str_need_string(rb_default_rs);
     }
 
-    const long len = str_length(RSTR(str), false);
+    const long len = str_length(RSTR(str), true);
     const bool tainted = OBJ_TAINTED(str);
 
     long pos = 0;
     do {
 	const long off = str_index_for_string(RSTR(str), rs_str, pos, -1,
-		false, false);
+		false, true);
 
 	long substr_len = 0;
 	if (off < 0) {
@@ -5929,7 +5941,7 @@
 rb_str_get_uchar(VALUE str, long pos)
 {
     if (RSTR(str)) {
-	return str_get_uchar(RSTR(str), pos, false);
+	return str_get_uchar(RSTR(str), pos, true);
     }
     assert(pos >= 0 && pos < CFStringGetLength((CFStringRef)str));
     return CFStringGetCharacterAtIndex((CFStringRef)str, pos);
@@ -5965,7 +5977,7 @@
 rb_str_chars_len(VALUE str)
 {
     if (IS_RSTR(str)) {
-	return str_length(RSTR(str), false);
+	return str_length(RSTR(str), true);
     }
     return CFStringGetLength((CFStringRef)str);
 }
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20100324/cefe9541/attachment-0001.html>