[macruby-changes] [3587] MacRuby/branches/icu

Mon Feb 22 23:58:13 PST 2010

Revision: 3587
          http://trac.macosforge.org/projects/ruby/changeset/3587
Author:   lsansonetti at apple.com
Date:     2010-02-22 23:58:13 -0800 (Mon, 22 Feb 2010)
Log Message:
-----------
more string work

Modified Paths:
--------------
    MacRuby/branches/icu/encoding.h
    MacRuby/branches/icu/string.c

Modified: MacRuby/branches/icu/encoding.h
===================================================================

--- MacRuby/branches/icu/encoding.h	2010-02-20 23:09:36 UTC (rev 3586)
+++ MacRuby/branches/icu/encoding.h	2010-02-23 07:58:13 UTC (rev 3587)
@@ -281,10 +281,14 @@
 	    STRING_VALID_ENCODING);
 }
 
+// The following functions should always be preferred over anything else,
+// especially if this "else" is RSTRING_PTR and RSTRING_LEN.
+// They also work on CFStrings.
 VALUE rb_unicode_str_new(const UniChar *ptr, const size_t len);
 void rb_str_get_uchars(VALUE str, UChar **chars_p, long *chars_len_p,
 	bool *need_free_p);
 long rb_str_chars_len(VALUE str);
+UChar rb_str_get_uchar(VALUE str, long pos);
 
 VALUE mr_enc_s_is_compatible(VALUE klass, SEL sel, VALUE str1, VALUE str2);
 

Modified: MacRuby/branches/icu/string.c
===================================================================
--- MacRuby/branches/icu/string.c	2010-02-20 23:09:36 UTC (rev 3586)
+++ MacRuby/branches/icu/string.c	2010-02-23 07:58:13 UTC (rev 3587)
@@ -214,6 +214,12 @@
     return str;
 }
 
+static VALUE
+str_new(void)
+{
+    return (VALUE)str_alloc(rb_cRubyString);
+}
+
 static void
 str_replace_with_bytes(rb_str_t *self, const char *bytes, long len,
 	rb_encoding_t *enc)
@@ -677,6 +683,27 @@
 }
 
 static void
+str_delete(rb_str_t *self, long pos, long len, bool ucs2_mode)
+{
+    assert(pos >= 0 && len > 0);
+    const long self_len = str_length(self, ucs2_mode);
+    if (pos + len == self_len) {
+	// We are deleting stuff from the end of the string. We can simply
+	// change the string size here.
+	if (str_is_stored_in_uchars(self)) {
+	    self->length_in_bytes -= UCHARS_TO_BYTES(len);
+	}
+	else {
+	    self->length_in_bytes -= len;
+	}
+    }
+    else {
+	assert(pos + len < self_len);
+	abort(); // TODO
+    }
+}
+
+static void
 str_concat_string(rb_str_t *self, rb_str_t *str)
 {
     if (str->length_in_bytes == 0) {
@@ -948,6 +975,9 @@
     if (len < 0) {
 	return Qnil;
     }
+    if (len == 0) {
+	return str_new();
+    }	
 
     const long n = str_length(RSTR(str), true);
     if (beg < 0) {
@@ -964,6 +994,13 @@
     return substr == NULL ? Qnil : (VALUE)substr;
 }
 
+static VALUE
+str_trim(VALUE str)
+{
+    // TODO
+    return str;
+}
+
 //----------------------------------------------
 // Functions called by MacRuby
 
@@ -1935,6 +1972,352 @@
     return block_given ? self : ary;
 }
 
+/*
+ *  call-seq:
+ *     str.split(pattern=$;, [limit])   => anArray
+ *  
+ *  Divides <i>str</i> into substrings based on a delimiter, returning an array
+ *  of these substrings.
+ *     
+ *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
+ *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
+ *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
+ *  of contiguous whitespace characters ignored.
+ *     
+ *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
+ *  pattern matches. Whenever the pattern matches a zero-length string,
+ *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
+ *  groups, the respective matches will be returned in the array as well.
+ *     
+ *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
+ *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
+ *  split on whitespace as if ` ' were specified.
+ *     
+ *  If the <i>limit</i> parameter is omitted, trailing null fields are
+ *  suppressed. If <i>limit</i> is a positive number, at most that number of
+ *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
+ *  string is returned as the only entry in an array). If negative, there is no
+ *  limit to the number of fields returned, and trailing null fields are not
+ *  suppressed.
+ *     
+ *     " now's  the time".split        #=> ["now's", "the", "time"]
+ *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
+ *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
+ *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
+ *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
+ *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
+ *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
+ *     
+ *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
+ *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
+ *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
+ *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
+ */
+
+static VALUE
+rstr_split(VALUE str, SEL sel, int argc, VALUE *argv)
+{
+    const long len = str_length(RSTR(str), false);
+    int lim = 0;
+
+    VALUE spat, limit;
+    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
+	lim = NUM2INT(limit);
+	if (lim <= 0) {
+	    limit = Qnil;
+	}
+	else if (lim == 1) {
+	    if (len == 0) {
+		return rb_ary_new2(0);
+	    }
+	    return rb_ary_new3(1, str); // limit 1: whole string is the only field
+	}
+    }
+
+    VALUE result = rb_ary_new();
+    bool awk_split = false, spat_string = false;
+    long spat_len = 0;
+    if (NIL_P(spat)) {
+	if (!NIL_P(rb_fs)) {
+	    spat = rb_fs;
+	    goto fs_set;
+	}
+	awk_split = true;
+    }
+    else {
+fs_set:
+	if (TYPE(spat) == T_STRING) {
+	    spat_string = true;
+	    spat_len = rb_str_chars_len(spat);
+	    if (spat_len == 1 && rb_str_get_uchar(spat, 0) == ' ') {
+		awk_split = true;
+	    }
+	}
+	else {
+	    spat = get_pat(spat, true);
+	}
+    }
+
+    long beg = 0;
+    if (awk_split || spat_string) {
+	if (spat != Qnil) {
+	    if (spat_len == 0) {
+		do {
+		    VALUE substr = str_substr(str, beg, 1);
+		    rb_ary_push(result, substr);
+		    beg++;
+		    if (beg >= len) {
+			break;
+		    }
+		}
+		while (limit == Qnil || --lim > 1);
+	    }
+	    else {
+		rb_str_t *spat_str = str_need_string(spat);
+		do {
+		    const long pos = str_index_for_string(RSTR(str), spat_str,
+			    beg, false);
+		    if (pos == -1) {
+			break;
+		    }
+		    VALUE substr = str_substr(str, beg, pos - beg);
+		    if (!awk_split || rb_str_chars_len(str_trim(substr)) > 0) {
+			rb_ary_push(result, substr);
+		    }
+		    beg = pos + spat_len; // resume after the whole delimiter
+		}
+		while (limit == Qnil || --lim > 1);
+	    }
+	}
+	else {
+	    abort(); // TODO
+	}
+    }
+    else {
+	long start = beg;
+	bool last_null = false;
+	do {
+	    const long pos = rb_reg_search(spat, str, beg, false);
+	    if (pos < 0) {
+		break;
+	    }
+	    VALUE match = rb_backref_get();
+
+	    int count = 0;
+	    rb_match_result_t *results = rb_reg_match_results(match, &count);
+	    assert(count > 0);
+
+	    if (beg == pos && results[0].beg == results[0].end) {
+		if (last_null) {
+		    rb_ary_push(result, str_substr(str, beg, 1));
+		    beg = start;
+		}
+		else {
+		    start++;
+		    last_null = true;
+		    continue;
+		}
+	    }
+	    else {
+		rb_ary_push(result, str_substr(str, beg, pos - beg));
+		beg = results[0].end;
+	    }
+	    last_null = false;
+
+	    for (int i = 1; i < count; i++) {
+		rb_ary_push(result, rb_reg_nth_match(i, match));
+	    }
+	}
+	while (limit == Qnil || --lim > 1);
+    }
+
+    if (len > 0 && (!NIL_P(limit) || len > beg || lim < 0)) {
+	VALUE tmp;
+	if (len == beg) {
+	    tmp = rb_str_new(NULL, 0);
+	}
+	else {
+	    tmp = rb_str_subseq(str, beg, len - beg);
+	}
+	rb_ary_push(result, tmp);
+    }
+
+    if (NIL_P(limit) && lim == 0) { // no (or zero) limit: drop trailing empties
+	while (true) {
+	    const long n = RARRAY_LEN(result);
+	    if (n > 0 && rb_str_chars_len(RARRAY_AT(result, n - 1)) == 0) {
+		rb_ary_pop(result);
+	    }
+	    else {
+		break;
+	    }
+	}
+    }
+
+    return result;
+}
+
+/*
+ *  call-seq:
+ *     str.to_i(base=10)   => integer
+ *  
+ *  Returns the result of interpreting leading characters in <i>str</i> as an
+ *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
+ *  end of a valid number are ignored. If there is not a valid number at the
+ *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
+ *  exception.
+ *     
+ *     "12345".to_i             #=> 12345
+ *     "99 red balloons".to_i   #=> 99
+ *     "0a".to_i                #=> 0
+ *     "0a".to_i(16)            #=> 10
+ *     "hello".to_i             #=> 0
+ *     "1100101".to_i(2)        #=> 101
+ *     "1100101".to_i(8)        #=> 294977
+ *     "1100101".to_i(10)       #=> 1100101
+ *     "1100101".to_i(16)       #=> 17826049
+ */
+
+static VALUE
+rstr_to_i(VALUE str, SEL sel, int argc, VALUE *argv)
+{
+    int base = 10;
+
+    if (argc > 0) {
+	VALUE b;
+	rb_scan_args(argc, argv, "01", &b);
+
+	base = NUM2INT(b);
+	if (base < 0) {
+	    rb_raise(rb_eArgError, "invalid radix %d", base);
+	}
+    }
+
+    return rb_str_to_inum(str, base, Qfalse);
+}
+
+/*
+ *  call-seq:
+ *     str.hex   => integer
+ *  
+ *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
+ *  (with an optional sign and an optional <code>0x</code>) and returns the
+ *  corresponding number. Zero is returned on error.
+ *     
+ *     "0x0a".hex     #=> 10
+ *     "-1234".hex    #=> -4660
+ *     "0".hex        #=> 0
+ *     "wombat".hex   #=> 0
+ */
+
+static VALUE
+rstr_hex(VALUE str, SEL sel)
+{
+    return rb_str_to_inum(str, 16, Qfalse);
+}
+
+/*
+ *  call-seq:
+ *     str.oct   => integer
+ *  
+ *  Treats leading characters of <i>str</i> as a string of octal digits (with an
+ *  optional sign) and returns the corresponding number.  Returns 0 if the
+ *  conversion fails.
+ *     
+ *     "123".oct       #=> 83
+ *     "-377".oct      #=> -255
+ *     "bad".oct       #=> 0
+ *     "0377bad".oct   #=> 255
+ */
+
+static VALUE
+rstr_oct(VALUE str, SEL sel)
+{
+    return rb_str_to_inum(str, -8, Qfalse);
+}
+
+/*
+ *  call-seq:
+ *     str.chomp!(separator=$/)   => str or nil
+ *  
+ *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
+ *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
+ */
+
+static VALUE
+rstr_chomp_bang(VALUE str, SEL sel, int argc, VALUE *argv)
+{
+    VALUE rs;
+    if (rb_scan_args(argc, argv, "01", &rs) == 0) {
+	rs = rb_rs; // no separator given: use the current record separator
+    }
+    rstr_modify(str);
+    if (rs == Qnil) {
+	return Qnil;
+    }
+    StringValue(rs);
+
+    const long len = str_length(RSTR(str), false);
+    if (len == 0) {
+	return Qnil;
+    }
+
+    const long rslen = rb_str_chars_len(rs);
+    long to_del = 0;
+
+    if (rs == rb_default_rs
+	|| rslen == 0
+	|| (rslen == 1 && rb_str_get_uchar(rs, 0) == '\n')) {
+	UChar c = rb_str_get_uchar(str, len - 1);
+	if (c == '\n') {
+	    to_del++; // str may be exactly "\n": only look back when len > 1
+	    c = len > 1 ? rb_str_get_uchar(str, len - 2) : 0;
+	}
+	if (c == '\r' && (rslen > 0 || to_del != 0)) {
+	    to_del++;
+	}
+    }
+    else if (rslen <= len) {
+	if (str_index_for_string(RSTR(str), str_need_string(rs),
+		    len - rslen, false) >= 0) {
+	    to_del += rslen;
+	}
+    }
+
+    if (to_del == 0) {
+	return Qnil; // nothing removed
+    }
+    str_delete(RSTR(str), len - to_del, to_del, false);
+    return str;
+}
+
+/*
+ *  call-seq:
+ *     str.chomp(separator=$/)   => new_str
+ *  
+ *  Returns a new <code>String</code> with the given record separator removed
+ *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
+ *  changed from the default Ruby record separator, then <code>chomp</code> also
+ *  removes carriage return characters (that is it will remove <code>\n</code>,
+ *  <code>\r</code>, and <code>\r\n</code>).
+ *     
+ *     "hello".chomp            #=> "hello"
+ *     "hello\n".chomp          #=> "hello"
+ *     "hello\r\n".chomp        #=> "hello"
+ *     "hello\n\r".chomp        #=> "hello\n"
+ *     "hello\r".chomp          #=> "hello"
+ *     "hello \n there".chomp   #=> "hello \n there"
+ *     "hello".chomp("llo")     #=> "he"
+ */
+
+static VALUE
+rstr_chomp(VALUE str, SEL sel, int argc, VALUE *argv)
+{
+    str = rb_str_new3(str);
+    rstr_chomp_bang(str, 0, argc, argv);
+    return str;
+}
+
 // NSString primitives.
 
 static CFIndex
@@ -1946,13 +2329,7 @@
 static UniChar
 rstr_imp_characterAtIndex(void *rcv, SEL sel, CFIndex idx)
 {
-    // XXX implement a function that returns a unichar at given index
-    // and use it here.
-    if (str_try_making_data_uchars(RSTR(rcv))) {
-	return RSTR(rcv)->data.uchars[idx];
-    }
-    assert(BINARY_ENC(RSTR(rcv)->encoding));
-    return RSTR(rcv)->data.bytes[idx];
+    return rb_str_get_uchar((VALUE)rcv, idx);
 }
 
 void
@@ -2005,6 +2382,12 @@
     rb_objc_define_method(rb_cRubyString, "match", rstr_match2, -1);
     rb_objc_define_method(rb_cRubyString, "=~", rstr_match, 1);
     rb_objc_define_method(rb_cRubyString, "scan", rstr_scan, 1);
+    rb_objc_define_method(rb_cRubyString, "split", rstr_split, -1);
+    rb_objc_define_method(rb_cRubyString, "to_i", rstr_to_i, -1);
+    rb_objc_define_method(rb_cRubyString, "hex", rstr_hex, 0);
+    rb_objc_define_method(rb_cRubyString, "oct", rstr_oct, 0);
+    rb_objc_define_method(rb_cRubyString, "chomp", rstr_chomp, -1);
+    rb_objc_define_method(rb_cRubyString, "chomp!", rstr_chomp_bang, -1);
 
     // Added for MacRuby (debugging).
     rb_objc_define_method(rb_cRubyString, "__chars_count__",
@@ -2354,6 +2737,21 @@
     }
 }
 
+UChar
+rb_str_get_uchar(VALUE str, long pos)
+{
+    assert(pos >= 0 && pos < rb_str_chars_len(str));
+    if (IS_RSTR(str)) { // anything else falls through to the CFString call
+	if (str_try_making_data_uchars(RSTR(str))) {
+	    // FIXME: Not ucs2 compliant.
+	    return RSTR(str)->data.uchars[pos];
+	}
+	assert(BINARY_ENC(RSTR(str)->encoding));
+	return RSTR(str)->data.bytes[pos];
+    }
+    return CFStringGetCharacterAtIndex((CFStringRef)str, pos);
+}
+
 long
 rb_str_chars_len(VALUE str)
 {
@@ -2512,8 +2910,7 @@
     if (IS_RSTR(str)) {
 	return str_substr(str, beg, len);
     }
-    // TODO
-    return Qnil;
+    abort(); // TODO
 }
 
 VALUE
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20100222/66cd29c5/attachment-0001.html>