[macruby-changes] [3577] MacRuby/branches/icu

source_changes at macosforge.org source_changes at macosforge.org
Thu Feb 18 22:17:12 PST 2010


Revision: 3577
          http://trac.macosforge.org/projects/ruby/changeset/3577
Author:   lsansonetti at apple.com
Date:     2010-02-18 22:17:08 -0800 (Thu, 18 Feb 2010)
Log Message:
-----------
almost done with regexp

Modified Paths:
--------------
    MacRuby/branches/icu/include/ruby/intern.h
    MacRuby/branches/icu/re.cpp
    MacRuby/branches/icu/vm.cpp

Modified: MacRuby/branches/icu/include/ruby/intern.h
===================================================================
--- MacRuby/branches/icu/include/ruby/intern.h	2010-02-19 03:58:06 UTC (rev 3576)
+++ MacRuby/branches/icu/include/ruby/intern.h	2010-02-19 06:17:08 UTC (rev 3577)
@@ -514,6 +514,8 @@
 VALUE rb_reg_nth_match(int, VALUE);
 VALUE rb_reg_last_match(VALUE);
 VALUE rb_reg_match_last(VALUE);
+VALUE rb_reg_match_pre(VALUE);
+VALUE rb_reg_match_post(VALUE);
 #define HAVE_RB_REG_NEW_STR 1
 VALUE rb_reg_new_str(VALUE, int);
 VALUE rb_reg_new(const char *, long, int);

Modified: MacRuby/branches/icu/re.cpp
===================================================================
--- MacRuby/branches/icu/re.cpp	2010-02-19 03:58:06 UTC (rev 3576)
+++ MacRuby/branches/icu/re.cpp	2010-02-19 06:17:08 UTC (rev 3577)
@@ -123,6 +123,16 @@
     return unistr;
 }
 
+static VALUE
+unistr_subseq(UnicodeString *unistr, int beg, int len)
+{
+    assert(unistr != NULL);
+    assert(beg + len <= unistr->length());
+
+    const UChar *chars = unistr->getBuffer();
+    return rb_unicode_str_new(&chars[beg], len);
+}
+
 static bool
 init_from_string(rb_regexp_t *regexp, VALUE str, int option, VALUE *excp)
 {
@@ -187,6 +197,14 @@
 	case 'm':
 	    *option = REGEXP_OPT_MULTILINE;
 	    return true;
+
+	// Stupid MRI encoding flags, let's ignore them for now.
+	case 'n':
+	case 'e':
+	case 'u':
+	case 's':
+	    *option = 0;
+	    return true;
     }
     *option = -1;
     return false;
@@ -378,6 +396,7 @@
     }
 
     if (!matcher->find()) {
+	delete unistr;
 	delete matcher;
 	rb_backref_set(Qnil);
 	return -1;
@@ -705,9 +724,6 @@
 	    rb_reg_s_try_convert, 1);
 #endif
 
-    regexp_finalize_imp_super = rb_objc_install_method2((Class)rb_cRegexp,
-	    "finalize", (IMP)regexp_finalize_imp);
-
     rb_objc_define_method(rb_cRegexp, "initialize",
 	    (void *)regexp_initialize, -1);
     rb_objc_define_method(rb_cRegexp, "initialize_copy",
@@ -733,6 +749,9 @@
 #endif
     rb_objc_define_method(rb_cRegexp, "inspect", (void *)regexp_inspect, 0);
 
+    regexp_finalize_imp_super = rb_objc_install_method2((Class)rb_cRegexp,
+	    "finalize", (IMP)regexp_finalize_imp);
+
     rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(REGEXP_OPT_IGNORECASE));
     rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(REGEXP_OPT_EXTENDED));
     rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(REGEXP_OPT_MULTILINE));
@@ -740,39 +759,399 @@
     Init_Match();
 }
 
-VALUE
-rb_reg_nth_match(int nth, VALUE match)
+static VALUE
+match_initialize_copy(VALUE rcv, SEL sel, VALUE other)
 {
-    if (NIL_P(match)) {
-	return Qnil;
+    if (TYPE(other) != T_MATCH) {
+	rb_raise(rb_eTypeError, "wrong argument type");
     }
-    if (nth >= RMATCH(match)->results_count) {
-	return Qnil;
+    // Makes rcv a copy of other: own unistr, same regexp, own results array.
+    match_finalize(RMATCH(rcv)); // presumably drops rcv's old state - confirm
+
+    RMATCH(rcv)->unistr = new UnicodeString(*RMATCH(other)->unistr);
+    GC_WB(&RMATCH(rcv)->regexp, RMATCH(other)->regexp);
+
+    const long len = sizeof(rb_match_result_t) * RMATCH(other)->results_count;
+    rb_match_result_t *res = (rb_match_result_t *)xmalloc(len);
+    memcpy(res, RMATCH(other)->results, len);
+    GC_WB(&RMATCH(rcv)->results, res);
+    RMATCH(rcv)->results_count = RMATCH(other)->results_count; // keep in sync
+    return rcv;
+}
+
+/*
+ * call-seq:
+ *    mtch.regexp   => regexp
+ *
+ * Returns the regexp.
+ *
+ *     m = /a.*b/.match("abc")
+ *     m.regexp #=> /a.*b/
+ */
+
+static VALUE
+match_regexp(VALUE rcv, SEL sel)
+{
+    assert(RMATCH(rcv)->regexp != NULL);
+    return (VALUE)RMATCH(rcv)->regexp;
+}
+
+/*
+ * call-seq:
+ *    mtch.names   => [name1, name2, ...]
+ *
+ * Returns a list of names of captures as an array of strings.
+ * It is the same as mtch.regexp.names.
+ *
+ *     /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names
+ *     #=> ["foo", "bar", "baz"]
+ *
+ *     m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil>
+ *     m.names                          #=> ["x", "y"]
+ */
+
+static VALUE
+match_names(VALUE rcv, SEL sel)
+{
+    // TODO
+    return rb_ary_new();
+}
+
+/*
+ *  call-seq:
+ *     mtch.length   => integer
+ *     mtch.size     => integer
+ *
+ *  Returns the number of elements in the match array.
+ *
+ *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
+ *     m.length   #=> 5
+ *     m.size     #=> 5
+ */
+
+static VALUE
+match_size(VALUE rcv, SEL sel)
+{
+    return INT2FIX(RMATCH(rcv)->results_count);
+}
+
+/*
+ *  call-seq:
+ *     mtch.offset(n)   => array
+ *
+ *  Returns a two-element array containing the beginning and ending offsets of
+ *  the <em>n</em>th match.
+ *  <em>n</em> can be a string or symbol to reference a named capture.
+ *
+ *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
+ *     m.offset(0)      #=> [1, 7]
+ *     m.offset(4)      #=> [6, 7]
+ *
+ *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
+ *     p m.offset(:foo) #=> [0, 1]
+ *     p m.offset(:bar) #=> [2, 3]
+ *
+ */
+
+static int
+match_backref_number(VALUE match, VALUE backref, bool check)
+{
+    const char *name;
+
+    switch (TYPE(backref)) {
+	default:
+	    {
+		const int pos = NUM2INT(backref);
+		if (check) {
+		    if (pos < 0 || pos >= RMATCH(match)->results_count) {
+			rb_raise(rb_eIndexError,
+				"index %d out of matches", pos);
+		    }
+		}
+		return pos;
+	    }
+
+	case T_SYMBOL:
+	    name = rb_sym2name(backref);
+	    break;
+
+	case T_STRING:
+	    name = StringValueCStr(backref);
+	    break;
     }
-    if (nth < 0) {
-	nth += RMATCH(match)->results_count;
-	if (nth <= 0) {
-	    return Qnil;
+
+    // TODO
+    rb_raise(rb_eIndexError, "named captures are not yet supported");
+}
+ 
+static VALUE
+match_offset(VALUE rcv, SEL sel, VALUE backref)
+{
+    const int pos = match_backref_number(rcv, backref, true);
+    return rb_assoc_new(INT2FIX(RMATCH(rcv)->results[pos].beg),
+	    INT2FIX(RMATCH(rcv)->results[pos].end));
+}
+
+/*
+ *  call-seq:
+ *     mtch.begin(n)   => integer
+ *
+ *  Returns the offset of the start of the <em>n</em>th element of the match
+ *  array in the string.
+ *  <em>n</em> can be a string or symbol to reference a named capture.
+ *
+ *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
+ *     m.begin(0)       #=> 1
+ *     m.begin(2)       #=> 2
+ *
+ *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
+ *     p m.begin(:foo)  #=> 0
+ *     p m.begin(:bar)  #=> 2
+ */
+
+static VALUE
+match_begin(VALUE rcv, SEL sel, VALUE backref)
+{
+    const int pos = match_backref_number(rcv, backref, true);
+    return INT2FIX(RMATCH(rcv)->results[pos].beg);
+}
+
+/*
+ *  call-seq:
+ *     mtch.end(n)   => integer
+ *
+ *  Returns the offset of the character immediately following the end of the
+ *  <em>n</em>th element of the match array in the string.
+ *  <em>n</em> can be a string or symbol to reference a named capture.
+ *
+ *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
+ *     m.end(0)         #=> 7
+ *     m.end(2)         #=> 3
+ *
+ *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
+ *     p m.end(:foo)    #=> 1
+ *     p m.end(:bar)    #=> 3
+ */
+
+static VALUE
+match_end(VALUE rcv, SEL sel, VALUE backref)
+{
+    const int pos = match_backref_number(rcv, backref, true);
+    return INT2FIX(RMATCH(rcv)->results[pos].end);
+}
+
+/*
+ *  call-seq:
+ *     mtch.to_a   => anArray
+ *
+ *  Returns the array of matches.
+ *
+ *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
+ *     m.to_a   #=> ["HX1138", "H", "X", "113", "8"]
+ *
+ *  Because <code>to_a</code> is called when expanding
+ *  <code>*</code><em>variable</em>, there's a useful assignment
+ *  shortcut for extracting matched fields. This is slightly slower than
+ *  accessing the fields directly (as an intermediate array is
+ *  generated).
+ *
+ *     all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138."))
+ *     all   #=> "HX1138"
+ *     f1    #=> "H"
+ *     f2    #=> "X"
+ *     f3    #=> "113"
+ */
+
+static VALUE
+match_array(VALUE match, int start)
+{
+    const int len = RMATCH(match)->results_count;
+    assert(start >= 0 && start <= len); // start == len: nothing to collect
+    const bool tainted = OBJ_TAINTED(match);
+    // Builds an array of matches start..len-1, tainting each if match is.
+    VALUE ary = rb_ary_new2(len);
+    for (int i = start; i < len; i++) {
+	VALUE str = rb_reg_nth_match(i, match);
+	if (tainted) {
+	    OBJ_TAINT(str);
 	}
+	rb_ary_push(ary, str);
     }
+    return ary;
+}
 
-    const int beg = RMATCH(match)->results[nth].beg;
-    const int len = RMATCH(match)->results[nth].end - beg;
+static VALUE
+match_to_a(VALUE rcv, SEL sel)
+{
+    return match_array(rcv, 0);
+}
 
-    UnicodeString *unistr = RMATCH(match)->unistr;
-    assert(unistr != NULL);
-    assert(beg + len <= unistr->length());
-    const UChar *chars = unistr->getBuffer();
-    return rb_unicode_str_new(&chars[beg], len);
+/*
+ *  call-seq:
+ *     mtch.captures   => array
+ *
+ *  Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
+ *
+ *     f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
+ *     f1    #=> "H"
+ *     f2    #=> "X"
+ *     f3    #=> "113"
+ *     f4    #=> "8"
+ */
+
+static VALUE
+match_captures(VALUE rcv, SEL sel)
+{
+    return match_array(rcv, 1);
 }
 
+/*
+ *  call-seq:
+ *     mtch[i]               => str or nil
+ *     mtch[start, length]   => array
+ *     mtch[range]           => array
+ *     mtch[name]            => str or nil
+ *
+ *  Match Reference---<code>MatchData</code> acts as an array, and may be
+ *  accessed using the normal array indexing techniques.  <i>mtch</i>[0] is
+ *  equivalent to the special variable <code>$&</code>, and returns the entire
+ *  matched string.  <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
+ *  of the matched backreferences (portions of the pattern between parentheses).
+ *
+ *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
+ *     m          #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8">
+ *     m[0]       #=> "HX1138"
+ *     m[1, 2]    #=> ["H", "X"]
+ *     m[1..3]    #=> ["H", "X", "113"]
+ *     m[-3, 2]   #=> ["X", "113"]
+ *
+ *     m = /(?<foo>a+)b/.match("ccaaab")
+ *     m          #=> #<MatchData "aaab" foo:"aaa">
+ *     m["foo"]   #=> "aaa"
+ *     m[:foo]    #=> "aaa"
+ */
+
+static VALUE
+match_aref(VALUE rcv, SEL sel, int argc, VALUE *argv)
+{
+    VALUE backref, rest;
+    // A lone integer or capture name yields one match; the rest act like Array#[].
+    rb_scan_args(argc, argv, "11", &backref, &rest);
+    const int t = TYPE(backref); // a Range must not reach NUM2INT below
+    if (NIL_P(rest) && (t == T_FIXNUM || t == T_SYMBOL || t == T_STRING)) {
+	const int pos = match_backref_number(rcv, backref, false);
+	return rb_reg_nth_match(pos, rcv);
+    }
+    // Ranges and (start, length) pairs are resolved against the to_a array.
+    return rb_ary_aref(match_to_a(rcv, 0), 0, argc, argv);
+}
+
+/*
+ *  call-seq:
+ *
+ *     mtch.values_at([index]*)   => array
+ *
+ *  Uses each <i>index</i> to access the matching values, returning an array of
+ *  the corresponding matches.
+ *
+ *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
+ *     m.to_a               #=> ["HX1138", "H", "X", "113", "8"]
+ *     m.values_at(0, 2, -2)   #=> ["HX1138", "X", "113"]
+ */
+
+static VALUE
+match_entry(VALUE match, long n)
+{
+    return rb_reg_nth_match(n, match);
+}
+
+static VALUE
+match_values_at(VALUE rcv, SEL sel, int argc, VALUE *argv)
+{
+    return rb_get_values_at(rcv, RMATCH(rcv)->results_count, argc, argv,
+	    match_entry);
+}
+
+/*
+ *  call-seq:
+ *     mtch.pre_match   => str
+ *
+ *  Returns the portion of the original string before the current match.
+ *  Equivalent to the special variable <code>$`</code>.
+ *
+ *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
+ *     m.pre_match   #=> "T"
+ */
+
+static VALUE
+match_pre(VALUE rcv, SEL sel)
+{
+    assert(RMATCH(rcv)->results_count > 0);
+
+    VALUE str = unistr_subseq(RMATCH(rcv)->unistr, 0,
+	    RMATCH(rcv)->results[0].beg);
+
+    if (OBJ_TAINTED(rcv)) {
+	OBJ_TAINT(str);
+    }
+    return str;
+}
+
 VALUE
-rb_reg_last_match(VALUE match)
+rb_reg_match_pre(VALUE rcv)
 {
-    return rb_reg_nth_match(0, match);
+    if (NIL_P(rcv)) {
+	return Qnil;
+    }
+    return match_pre(rcv, 0);
 }
 
 /*
+ *  call-seq:
+ *     mtch.post_match   => str
+ *
+ *  Returns the portion of the original string after the current match.
+ *  Equivalent to the special variable <code>$'</code>.
+ *
+ *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
+ *     m.post_match   #=> ": The Movie"
+ */
+
+static VALUE
+match_post(VALUE rcv, SEL sel)
+{
+    assert(RMATCH(rcv)->results_count > 0);
+
+    const int pos = RMATCH(rcv)->results[0].end;
+    VALUE str = unistr_subseq(RMATCH(rcv)->unistr, pos,
+	    RMATCH(rcv)->unistr->length() - pos);
+
+    if (OBJ_TAINTED(rcv)) {
+	OBJ_TAINT(str);
+    }
+    return str;
+}
+
+VALUE
+rb_reg_match_post(VALUE rcv)
+{
+    if (NIL_P(rcv)) {
+	return Qnil;
+    }
+    return match_post(rcv, 0);
+}
+
+VALUE
+rb_reg_match_last(VALUE rcv)
+{
+    if (NIL_P(rcv)) {
+	return Qnil;
+    }
+    assert(RMATCH(rcv)->results_count > 0);
+    return rb_reg_nth_match(RMATCH(rcv)->results_count - 1, rcv);
+}
+
+/*
  * call-seq:
  *    mtch.inspect   => str
  *
@@ -792,6 +1171,34 @@
  *
  */
 
+VALUE
+rb_reg_nth_match(int nth, VALUE match)
+{
+    if (NIL_P(match)) {
+	return Qnil;
+    }
+    if (nth >= RMATCH(match)->results_count) {
+	return Qnil;
+    }
+    if (nth < 0) {
+	nth += RMATCH(match)->results_count;
+	if (nth <= 0) {
+	    return Qnil;
+	}
+    }
+
+    const int beg = RMATCH(match)->results[nth].beg;
+    const int len = RMATCH(match)->results[nth].end - beg;
+
+    return unistr_subseq(RMATCH(match)->unistr, beg, len);
+}
+
+VALUE
+rb_reg_last_match(VALUE match)
+{
+    return rb_reg_nth_match(0, match);
+}
+
 static VALUE
 match_inspect(VALUE rcv, SEL sel)
 {
@@ -876,33 +1283,34 @@
 Init_Match(void)
 {
     rb_cMatch = rb_define_class("MatchData", rb_cObject);
-    rb_objc_define_method(*(VALUE *)rb_cMatch, "alloc", (void *)match_alloc, 0);
     rb_undef_method(CLASS_OF(rb_cMatch), "new");
 
-    match_finalize_imp_super = rb_objc_install_method2((Class)rb_cMatch,
-	    "finalize", (IMP)match_finalize_imp);
-
-#if 0
-    rb_objc_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
-    rb_objc_define_method(rb_cMatch, "regexp", match_regexp, 0);
-    rb_objc_define_method(rb_cMatch, "names", match_names, 0);
-    rb_objc_define_method(rb_cMatch, "size", match_size, 0);
-    rb_objc_define_method(rb_cMatch, "length", match_size, 0);
-    rb_objc_define_method(rb_cMatch, "offset", match_offset, 1);
-    rb_objc_define_method(rb_cMatch, "begin", match_begin, 1);
-    rb_objc_define_method(rb_cMatch, "end", match_end, 1);
-    rb_objc_define_method(rb_cMatch, "to_a", match_to_a, 0);
-    rb_objc_define_method(rb_cMatch, "[]", match_aref, -1);
-    rb_objc_define_method(rb_cMatch, "captures", match_captures, 0);
-    rb_objc_define_method(rb_cMatch, "values_at", match_values_at, -1);
-    rb_objc_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
-    rb_objc_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
-#endif
+    rb_objc_define_method(*(VALUE *)rb_cMatch, "alloc", (void *)match_alloc, 0);
+    rb_objc_define_method(rb_cMatch, "initialize_copy",
+	    (void *)match_initialize_copy, 1);
+    rb_objc_define_method(rb_cMatch, "regexp", (void *)match_regexp, 0);
+    rb_objc_define_method(rb_cMatch, "names", (void *)match_names, 0);
+    rb_objc_define_method(rb_cMatch, "size", (void *)match_size, 0);
+    rb_objc_define_method(rb_cMatch, "length", (void *)match_size, 0);
+    rb_objc_define_method(rb_cMatch, "offset", (void *)match_offset, 1);
+    rb_objc_define_method(rb_cMatch, "begin", (void *)match_begin, 1);
+    rb_objc_define_method(rb_cMatch, "end", (void *)match_end, 1);
+    rb_objc_define_method(rb_cMatch, "to_a", (void *)match_to_a, 0);
+    rb_objc_define_method(rb_cMatch, "captures", (void *)match_captures, 0);
+    rb_objc_define_method(rb_cMatch, "[]", (void *)match_aref, -1);
+    rb_objc_define_method(rb_cMatch, "values_at", (void *)match_values_at, -1);
+    rb_objc_define_method(rb_cMatch, "pre_match", (void *)match_pre, 0);
+    rb_objc_define_method(rb_cMatch, "post_match", (void *)match_post, 0);
     rb_objc_define_method(rb_cMatch, "to_s", (void *)match_to_s, 0);
     rb_objc_define_method(rb_cMatch, "string", (void *)match_string, 0);
     rb_objc_define_method(rb_cMatch, "inspect", (void *)match_inspect, 0);
+
+    match_finalize_imp_super = rb_objc_install_method2((Class)rb_cMatch,
+	    "finalize", (IMP)match_finalize_imp);
 }
 
+// MRI compatibility.
+
 VALUE
 rb_reg_check_preprocess(VALUE str)
 {
@@ -937,24 +1345,6 @@
     return rb_reg_new_str(rb_usascii_str_new(cstr, len), options);
 }
 
-VALUE
-rb_reg_match_last(VALUE match)
-{
-    return Qnil;
-}
-
-VALUE
-rb_reg_match_pre(VALUE match, SEL sel)
-{
-    return Qnil;
-}
-
-VALUE
-rb_reg_match_post(VALUE match, SEL sel)
-{
-    return Qnil;
-}
-
 void
 rb_match_busy(VALUE match)
 {

Modified: MacRuby/branches/icu/vm.cpp
===================================================================
--- MacRuby/branches/icu/vm.cpp	2010-02-19 03:58:06 UTC (rev 3576)
+++ MacRuby/branches/icu/vm.cpp	2010-02-19 06:17:08 UTC (rev 3577)
@@ -3341,9 +3341,6 @@
 }
 #endif
 
-extern "C" VALUE rb_reg_match_pre(VALUE match, SEL sel);
-extern "C" VALUE rb_reg_match_post(VALUE match, SEL sel);
-
 extern "C"
 VALUE
 rb_vm_get_special(char code)
@@ -3359,10 +3356,10 @@
 	    val = rb_reg_last_match(backref);
 	    break;
 	case '`':
-	    val = rb_reg_match_pre(backref, 0);
+	    val = rb_reg_match_pre(backref);
 	    break;
 	case '\'':
-	    val = rb_reg_match_post(backref, 0);
+	    val = rb_reg_match_post(backref);
 	    break;
 	case '+':
 	    val = rb_reg_match_last(backref);
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20100218/3eda3c84/attachment-0001.html>


More information about the macruby-changes mailing list