[macruby-changes] [5130] MacRuby/trunk

Fri Jan 7 05:24:07 PST 2011

Revision: 5130
          http://trac.macosforge.org/projects/ruby/changeset/5130
Author:   vincent.isambart at gmail.com
Date:     2011-01-07 05:24:02 -0800 (Fri, 07 Jan 2011)
Log Message:
-----------
String#scan should now be much faster for non-ASCII strings

rb_str_subseq should now also do what it is supposed to: work on byte offsets instead of character indexes.

Next step: Make String#gsub faster

Modified Paths:
--------------
    MacRuby/trunk/encoding.h
    MacRuby/trunk/ext/iconv/iconv.c
    MacRuby/trunk/re.c
    MacRuby/trunk/string.c
    MacRuby/trunk/ucnv.c

Modified: MacRuby/trunk/encoding.h
===================================================================

--- MacRuby/trunk/encoding.h	2011-01-07 04:13:16 UTC (rev 5129)
+++ MacRuby/trunk/encoding.h	2011-01-07 13:24:02 UTC (rev 5130)
@@ -109,6 +109,12 @@
     long end_offset_in_bytes;
 } character_boundaries_t;
 
+typedef struct {
+    character_boundaries_t cached_boundaries; // boundaries stored for cached_boundaries_index
+    long cached_boundaries_index; // index the boundaries belong to; negative when nothing is cached
+    long cached_length; // length stored by str_length_with_cache; negative when not computed yet
+} character_boundaries_cache_t;
+
 typedef struct rb_encoding {
     struct RBasic basic;
     unsigned int index;
@@ -169,6 +175,14 @@
 
 #define ODD_NUMBER(x) ((x) & 0x1)
 
+static inline void
+reset_character_boundaries_cache(character_boundaries_cache_t *cache)
+{
+    assert(cache != NULL);
+    cache->cached_boundaries_index = -1; // negative index: no boundaries cached
+    cache->cached_length = -1; // negative length: not computed yet
+}
+
 static inline long
 div_round_up(long a, long b)
 {
@@ -278,6 +292,11 @@
         TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, NULL);
 }
 
+VALUE rb_str_substr_with_cache(VALUE str, long beg, long len,
+	character_boundaries_cache_t *cache);
+VALUE rb_reg_nth_match_with_cache(int nth, VALUE match,
+	character_boundaries_cache_t *cache);
+
 int rstr_compare(rb_str_t *str1, rb_str_t *str2);
 
 void rb_str_NSCoder_encode(void *coder, VALUE str, const char *key);

Modified: MacRuby/trunk/ext/iconv/iconv.c
===================================================================
--- MacRuby/trunk/ext/iconv/iconv.c	2011-01-07 04:13:16 UTC (rev 5129)
+++ MacRuby/trunk/ext/iconv/iconv.c	2011-01-07 13:24:02 UTC (rev 5130)
@@ -941,7 +941,7 @@
 	if (NIL_P(n2) || length < 0) {
 	    length = slen;
 	}
-	str = rb_str_subseq(str, start, length);
+	str = rb_str_substr(str, start, length);
     }
 
     return iconv_convert(VALUE2ICONV(cd), str, 0, -1, rb_enc_get_index(self), NULL);

Modified: MacRuby/trunk/re.c
===================================================================
--- MacRuby/trunk/re.c	2011-01-07 04:13:16 UTC (rev 5129)
+++ MacRuby/trunk/re.c	2011-01-07 13:24:02 UTC (rev 5130)
@@ -1852,7 +1852,8 @@
 }
 
 VALUE
-rb_reg_nth_match(int nth, VALUE match)
+rb_reg_nth_match_with_cache(int nth, VALUE match,
+	character_boundaries_cache_t *cache)
 {
     if (NIL_P(match)) {
 	return Qnil;
@@ -1873,10 +1874,16 @@
 	return Qnil;
     }
 
-    return rb_str_substr(RMATCH(match)->str, beg, end - beg);
+    return rb_str_substr_with_cache(RMATCH(match)->str, beg, end - beg, cache);
 }
 
 VALUE
+rb_reg_nth_match(int nth, VALUE match)
+{
+    return rb_reg_nth_match_with_cache(nth, match, NULL);
+}
+
+VALUE
 rb_reg_last_match(VALUE match)
 {
     return rb_reg_nth_match(0, match);

Modified: MacRuby/trunk/string.c
===================================================================
--- MacRuby/trunk/string.c	2011-01-07 04:13:16 UTC (rev 5129)
+++ MacRuby/trunk/string.c	2011-01-07 13:24:02 UTC (rev 5130)
@@ -379,14 +379,28 @@
 }
 
 static long
-str_length(rb_str_t *self)
+str_length_with_cache(rb_str_t *self, character_boundaries_cache_t *cache)
 {
-    if (self->encoding->single_byte_encoding
+    // fast paths
+    if (self->length_in_bytes == 0) {
+	return 0;
+    }
+    else if (self->encoding->single_byte_encoding
 	    || (self->encoding->ascii_compatible && str_is_ascii_only(self))) {
 	return self->length_in_bytes;
     }
-    else if (IS_UTF8_ENC(self->encoding)) {
-	long length = 0;
+    else if (IS_UTF16_ENC(self->encoding)) {
+	return div_round_up(self->length_in_bytes, 2);
+    }
+
+    if (cache != NULL
+	    && cache->cached_length >= 0) {
+	return cache->cached_length;
+    }
+
+    // slow paths
+    long length = 0;
+    if (IS_UTF8_ENC(self->encoding)) {
 	int i = 0;
 	while (i < self->length_in_bytes) {
 	    UChar32 c;
@@ -402,16 +416,23 @@
 		length += 2;
 	    }
 	}
-	return length;
     }
-    else if (IS_UTF16_ENC(self->encoding)) {
-	return div_round_up(self->length_in_bytes, 2);
-    }
     else {
-	return str_ucnv_length(self, true);
+	length = str_ucnv_length(self, true);
     }
+
+    if (cache != NULL) {
+	cache->cached_length = length;
+    }
+
+    return length;
 }
 
+static long str_length(rb_str_t *self)
+{
+    return str_length_with_cache(self, NULL);
+}
+
 // Note that each_uchar32 iterates on Unicode characters
 // With a character not in the BMP the callback will only be called once!
 static void
@@ -533,6 +554,7 @@
 str_new_copy_of_part(rb_str_t *self, long offset_in_bytes,
 	long length_in_bytes)
 {
+    assert(length_in_bytes > 0);
     rb_str_t *str = str_alloc(rb_cRubyString);
     str->encoding = self->encoding;
     str->capacity_in_bytes = str->length_in_bytes = length_in_bytes;
@@ -552,10 +574,11 @@
 }
 
 static character_boundaries_t
-str_get_character_boundaries(rb_str_t *self, long index)
+str_get_character_boundaries(rb_str_t *self, long index, character_boundaries_cache_t *cache)
 {
     character_boundaries_t boundaries = {-1, -1};
 
+    // fast paths
     if (self->encoding->single_byte_encoding
 	    || (self->encoding->ascii_compatible && str_is_ascii_only(self))) {
 	if (index < 0) {
@@ -566,74 +589,105 @@
 	}
 	boundaries.start_offset_in_bytes = index;
 	boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes + 1;
+
+	return boundaries; // getting the offset is fast so no use caching it
     }
-    else if (IS_UTF8_ENC(self->encoding)) {
-	long pos = 0;
-	int i = 0;
+    else if (IS_UTF16_ENC(self->encoding)) {
 	if (index < 0) {
-	    index += str_length(self);
+	    index += div_round_up(self->length_in_bytes, 2);
 	    if (index < 0) {
 		return boundaries;
 	    }
 	}
-	while (i < self->length_in_bytes) {
+	boundaries.start_offset_in_bytes = UCHARS_TO_BYTES(index);
+	boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes + 2;
+
+	return boundaries; // getting the offset is fast so no use caching it
+    }
+
+    // slow path
+    if (index < 0) {
+	index += str_length_with_cache(self, cache);
+	if (index < 0) {
+	    return boundaries;
+	}
+    }
+
+    bool can_use_cache = (cache != NULL
+	    && cache->cached_boundaries_index >= 0);
+    if (can_use_cache && cache->cached_boundaries_index == index) {
+	return cache->cached_boundaries;
+    }
+
+    if (IS_UTF8_ENC(self->encoding)) {
+	long pos = 0;
+	int index_in_bytes = 0;
+	if (can_use_cache && cache->cached_boundaries_index < index) {
+	    // if we are in the middle of a non-BMP character,
+	    // end_offset_in_bytes or start_offset_in_bytes might be -1
+	    if (cache->cached_boundaries.end_offset_in_bytes == -1) {
+		index_in_bytes = cache->cached_boundaries.start_offset_in_bytes;
+		pos = cache->cached_boundaries_index;
+	    }
+	    else {
+		index_in_bytes = cache->cached_boundaries.end_offset_in_bytes;
+		pos = cache->cached_boundaries_index + 1;
+	    }
+	}
+	while (index_in_bytes < self->length_in_bytes) {
 	    UChar32 c;
-	    int old_i = i;
+	    int old_index_in_bytes = index_in_bytes;
 	    long new_pos = pos;
-	    U8_NEXT(self->bytes, i, self->length_in_bytes, c);
+	    U8_NEXT(self->bytes, index_in_bytes, self->length_in_bytes, c);
 	    if (c == U_SENTINEL) {
-		new_pos += i - old_i;
+		new_pos += index_in_bytes - old_index_in_bytes;
 		if (new_pos > index) {
 		    boundaries.start_offset_in_bytes =
-			old_i + (index - pos);
+			old_index_in_bytes + (index - pos);
 		    boundaries.end_offset_in_bytes =
 			boundaries.start_offset_in_bytes + 1;
-		    return boundaries;
+		    break;
 		}
 	    }
 	    else if (U_IS_BMP(c)) {
 		new_pos++;
 		if (new_pos > index) {
-		    boundaries.start_offset_in_bytes = old_i;
-		    boundaries.end_offset_in_bytes = i;
-		    return boundaries;
+		    boundaries.start_offset_in_bytes = old_index_in_bytes;
+		    boundaries.end_offset_in_bytes = index_in_bytes;
+		    break;
 		}
 	    }
 	    else {
 		new_pos += 2;
 		if (new_pos > index) {
 		    if (index == pos) {
-			boundaries.start_offset_in_bytes = old_i;
+			boundaries.start_offset_in_bytes = old_index_in_bytes;
 		    }
 		    else {
 			assert(index == pos + 1);
-			boundaries.end_offset_in_bytes = i;
+			boundaries.end_offset_in_bytes = index_in_bytes;
 		    }
-		    return boundaries;
+		    break;
 		}
 	    }
 	    pos = new_pos;
 	}
     }
-    else if (IS_UTF16_ENC(self->encoding)) {
-	if (index < 0) {
-	    index += div_round_up(self->length_in_bytes, 2);
-	    if (index < 0) {
-		return boundaries;
-	    }
-	}
-	boundaries.start_offset_in_bytes = UCHARS_TO_BYTES(index);
-	boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes + 2;
-    }
     else {
 	boundaries = str_ucnv_get_character_boundaries(self, index, true);
     }
 
+    if (cache != NULL) {
+	cache->cached_boundaries_index = index;
+	cache->cached_boundaries = boundaries;
+    }
+
     return boundaries;
 }
 
 static rb_str_t *
-str_get_characters(rb_str_t *self, long first, long last)
+str_get_characters(rb_str_t *self, long first, long last,
+	character_boundaries_cache_t *cache)
 {
     if (self->length_in_bytes == 0) {
 	if (first == 0) {
@@ -643,10 +697,17 @@
 	    return NULL;
 	}
     }
+
+    character_boundaries_cache_t local_cache;
+    if (cache == NULL) {
+	reset_character_boundaries_cache(&local_cache);
+	cache = &local_cache;
+    }
+
     character_boundaries_t first_boundaries =
-	str_get_character_boundaries(self, first);
+	str_get_character_boundaries(self, first, cache);
     character_boundaries_t last_boundaries =
-	str_get_character_boundaries(self, last);
+	str_get_character_boundaries(self, last, cache);
 
     if ((first_boundaries.start_offset_in_bytes == -1) ||
 	    (last_boundaries.end_offset_in_bytes == -1)) {
@@ -724,9 +785,12 @@
 	end.start_offset_in_bytes = end.end_offset_in_bytes = offset;
     }
     else {
+	character_boundaries_cache_t local_cache;
+	reset_character_boundaries_cache(&local_cache);
+
 	// Positioning in the string.
-	beg = str_get_character_boundaries(self, pos);
-	end = str_get_character_boundaries(self, pos + len - 1);
+	beg = str_get_character_boundaries(self, pos, &local_cache);
+	end = str_get_character_boundaries(self, pos + len - 1, &local_cache);
 
 	if ((beg.start_offset_in_bytes == -1) ||
 		(end.end_offset_in_bytes == -1)) {
@@ -867,14 +931,18 @@
     str_reset_flags(self);
     self->encoding = enc;
 
+    character_boundaries_cache_t local_cache;
+    reset_character_boundaries_cache(&local_cache);
+
     character_boundaries_t first_boundaries =
-	str_get_character_boundaries(self, start);
+	str_get_character_boundaries(self, start, &local_cache);
     character_boundaries_t last_boundaries;
     if (len == 1) {
 	last_boundaries = first_boundaries;
     }
     else {
-	last_boundaries = str_get_character_boundaries(self, start+len-1);
+	last_boundaries = str_get_character_boundaries(self, start+len-1,
+		&local_cache);
     }
 
     if ((first_boundaries.start_offset_in_bytes == -1) ||
@@ -1043,13 +1111,16 @@
 	return start_index;
     }
 
+    character_boundaries_cache_t local_cache;
+    reset_character_boundaries_cache(&local_cache);
+
     long start_offset_in_bytes;
     if (start_index == 0) {
 	start_offset_in_bytes = 0;
     }
     else {
 	character_boundaries_t boundaries = str_get_character_boundaries(self,
-		start_index);
+		start_index, &local_cache);
 	if (boundaries.start_offset_in_bytes == -1) {
 	    if (boundaries.end_offset_in_bytes == -1) {
 		return -1;
@@ -1068,7 +1139,7 @@
     }
     else {
 	character_boundaries_t boundaries = str_get_character_boundaries(self,
-		end_index);
+		end_index, &local_cache);
 	if (boundaries.start_offset_in_bytes == -1) {
 	    if (boundaries.end_offset_in_bytes == -1) {
 		return -1;
@@ -1256,13 +1327,14 @@
 }
 
 static VALUE
-rstr_substr(VALUE str, long beg, long len)
+rstr_substr_with_cache(VALUE str, long beg, long len,
+	character_boundaries_cache_t *cache)
 {
     if (len < 0) {
 	return Qnil;
     }
 
-    const long n = str_length(RSTR(str));
+    const long n = str_length_with_cache(RSTR(str), cache);
     if (beg < 0) {
 	beg += n;
     }
@@ -1276,11 +1348,17 @@
 	len = n - beg;
     }
 
-    rb_str_t *substr = str_get_characters(RSTR(str), beg, beg + len - 1);
+    rb_str_t *substr = str_get_characters(RSTR(str), beg, beg + len - 1, cache);
     OBJ_INFECT(substr, str);
     return substr == NULL ? Qnil : (VALUE)substr;
 }
 
+static VALUE
+rstr_substr(VALUE str, long beg, long len)
+{
+    return rstr_substr_with_cache(str, beg, len, NULL);
+}
+
 static void
 rstr_splice(VALUE self, long beg, long len, VALUE str)
 {
@@ -2961,6 +3039,9 @@
     pat = get_pat(pat, true);
     const bool tainted = OBJ_TAINTED(self) || OBJ_TAINTED(pat);
 
+    character_boundaries_cache_t local_cache;
+    reset_character_boundaries_cache(&local_cache);
+
     VALUE ary = 0;
     if (!block_given) {
 	ary = rb_ary_new();
@@ -2986,7 +3067,7 @@
 
 	VALUE scan_result;
 	if (count == 1) {
-	    scan_result = rb_reg_nth_match(0, match);
+	    scan_result = rb_reg_nth_match_with_cache(0, match, &local_cache);
 	    if (tainted) {
 		OBJ_TAINT(scan_result);
 	    }
@@ -2994,7 +3075,8 @@
 	else {
 	    scan_result = rb_ary_new2(count);
 	    for (int i = 1; i < count; i++) {
-		VALUE substr = rb_reg_nth_match(i, match);
+		VALUE substr = rb_reg_nth_match_with_cache(i, match,
+			&local_cache);
 		if (tainted) {
-		    OBJ_TAINT(tainted);
+		    OBJ_TAINT(substr); // taint the extracted substring, not the bool flag
 		}
@@ -3226,7 +3308,7 @@
 	    tmp = rb_str_new(NULL, 0);
 	}
 	else {
-	    tmp = rb_str_subseq(str, beg, len - beg);
+	    tmp = rb_str_substr(str, beg, len - beg);
 	}
 	rb_ary_push(result, tmp);
     }
@@ -6451,16 +6533,33 @@
 VALUE
 rb_str_subseq(VALUE str, long beg, long len)
 {
+    // Byte-based slice: copies `len` bytes of `str` starting at byte offset
+    // `beg`. The whole range has to lie inside the string; the bound is
+    // written as a subtraction so that beg + len cannot overflow.
+    assert(IS_RSTR(str) && beg >= 0 && len >= 0
+	    && len <= RSTR(str)->length_in_bytes - beg);
+    // str_new_copy_of_part asserts length_in_bytes > 0, so handle len == 0.
+    VALUE subseq = (len == 0)
+	? (VALUE)str_new_similar_empty_string(RSTR(str))
+	: (VALUE)str_new_copy_of_part(RSTR(str), beg, len);
+    OBJ_INFECT(subseq, str);
+    return subseq;
+}
+
+VALUE
+rb_str_substr_with_cache(VALUE str, long beg, long len,
+	character_boundaries_cache_t *cache)
+{
     if (!IS_RSTR(str)) {
 	str = (VALUE)str_need_string(str);
     }
-    return rstr_substr(str, beg, len);
+    return rstr_substr_with_cache(str, beg, len, cache);
 }
 
 VALUE
 rb_str_substr(VALUE str, long beg, long len)
 {
-    return rb_str_subseq(str, beg, len);
+    return rb_str_substr_with_cache(str, beg, len, NULL);
 }
 
 void

Modified: MacRuby/trunk/ucnv.c
===================================================================
--- MacRuby/trunk/ucnv.c	2011-01-07 04:13:16 UTC (rev 5129)
+++ MacRuby/trunk/ucnv.c	2011-01-07 13:24:02 UTC (rev 5130)
@@ -186,15 +186,8 @@
 str_ucnv_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode)
 {
     character_boundaries_t boundaries = {-1, -1};
+    assert(index >= 0);
 
-    if (index < 0) {
-	// calculating the length is slow but we don't have much choice
-	index += str_ucnv_length(self, ucs2_mode);
-	if (index < 0) {
-	    return boundaries;
-	}
-    }
-
     // the code has many similarities with str_length
     USE_CONVERTER(cnv, self->encoding);
 
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20110107/4e621173/attachment-0001.html>