[macruby-changes] [5180] MacRuby/trunk

Mon Jan 17 17:55:06 PST 2011

Revision: 5180
          http://trac.macosforge.org/projects/ruby/changeset/5180
Author:   lsansonetti at apple.com
Date:     2011-01-17 17:55:01 -0800 (Mon, 17 Jan 2011)
Log Message:
-----------
String#split should be faster on UTF-8 strings containing multibyte characters

Modified Paths:
--------------
    MacRuby/trunk/encoding_ucnv.h
    MacRuby/trunk/string.c
    MacRuby/trunk/ucnv.c

Modified: MacRuby/trunk/encoding_ucnv.h
===================================================================

--- MacRuby/trunk/encoding_ucnv.h	2011-01-18 00:18:59 UTC (rev 5179)
+++ MacRuby/trunk/encoding_ucnv.h	2011-01-18 01:55:01 UTC (rev 5180)
@@ -23,7 +23,6 @@
 void str_ucnv_update_flags(rb_str_t *self);
 long str_ucnv_length(rb_str_t *self, bool ucs2_mode);
 character_boundaries_t str_ucnv_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode);
-long str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes, bool ucs2_mode);
 void str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc, rb_str_t *self, long *pos, UChar **utf16, long *utf16_length);
 void str_ucnv_transcode_from_utf16(struct rb_encoding *dst_enc, UChar *utf16, long utf16_length, long *utf16_pos, char **bytes, long *bytes_length);
 void str_ucnv_each_uchar32(rb_str_t *self, each_uchar32_callback_t callback);

Modified: MacRuby/trunk/string.c
===================================================================
--- MacRuby/trunk/string.c	2011-01-18 00:18:59 UTC (rev 5179)
+++ MacRuby/trunk/string.c	2011-01-18 01:55:01 UTC (rev 5180)
@@ -574,7 +574,8 @@
 }
 
 static character_boundaries_t
-str_get_character_boundaries(rb_str_t *self, long index, character_boundaries_cache_t *cache)
+str_get_character_boundaries(rb_str_t *self, long index,
+	character_boundaries_cache_t *cache)
 {
     character_boundaries_t boundaries = {-1, -1};
 
@@ -1030,29 +1031,7 @@
     return self->length_in_bytes > str->length_in_bytes ? 1 : -1;
 }
 
-
 static long
-str_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes)
-{
-    if ((offset_in_bytes >= self->length_in_bytes) || (offset_in_bytes < 0)) {
-	return -1;
-    }
-    if (offset_in_bytes == 0) {
-	return 0;
-    }
-
-    if (self->encoding->single_byte_encoding) {
-	return offset_in_bytes;
-    }
-    else if (IS_UTF16_ENC(self->encoding)) {
-	return BYTES_TO_UCHARS(offset_in_bytes);
-    }
-    else {
-	return str_ucnv_offset_in_bytes_to_index(self, offset_in_bytes, true);
-    }
-}
-
-static long
 str_offset_in_bytes_for_string(rb_str_t *self, rb_str_t *searched,
 	long start_offset_in_bytes, long end_offset_in_bytes,
 	bool backward_search)
@@ -1071,8 +1050,7 @@
 	return -1;
     }
 
-    long increment = self->encoding->min_char_size;
-
+    const long increment = self->encoding->min_char_size;
     if (backward_search) {
 	for (long offset = end_offset_in_bytes - increment;
 		offset >= start_offset_in_bytes;
@@ -1100,8 +1078,9 @@
 }
 
 static long
-str_index_for_string(rb_str_t *self, rb_str_t *searched, long start_index,
-	long end_index, bool backward_search)
+str_index_for_string_with_cache(rb_str_t *self, rb_str_t *searched,
+	long start_index, long end_index, bool backward_search,
+	character_boundaries_cache_t *local_cache)
 {
     str_must_have_compatible_encoding(self, searched);
 
@@ -1109,16 +1088,13 @@
 	return start_index;
     }
 
-    character_boundaries_cache_t local_cache;
-    reset_character_boundaries_cache(&local_cache);
-
     long start_offset_in_bytes;
     if (start_index == 0) {
 	start_offset_in_bytes = 0;
     }
     else {
 	character_boundaries_t boundaries = str_get_character_boundaries(self,
-		start_index, &local_cache);
+		start_index, local_cache);
 	if (boundaries.start_offset_in_bytes == -1) {
 	    if (boundaries.end_offset_in_bytes == -1) {
 		return -1;
@@ -1132,12 +1108,13 @@
     }
 
     long end_offset_in_bytes;
-    if (end_index < 0 || end_index == str_length(self)) {
+    if (end_index < 0
+	    || end_index == str_length_with_cache(self, local_cache)) {
 	end_offset_in_bytes = self->length_in_bytes;
     }
     else {
 	character_boundaries_t boundaries = str_get_character_boundaries(self,
-		end_index, &local_cache);
+		end_index, local_cache);
 	if (boundaries.start_offset_in_bytes == -1) {
 	    if (boundaries.end_offset_in_bytes == -1) {
 		return -1;
@@ -1153,12 +1130,44 @@
     const long offset_in_bytes = str_offset_in_bytes_for_string(self,
 	    searched, start_offset_in_bytes, end_offset_in_bytes,
 	    backward_search);
-    if (offset_in_bytes == -1) {
+
+    if (offset_in_bytes < 0 || offset_in_bytes >= self->length_in_bytes) {
 	return -1;
     }
-    return str_offset_in_bytes_to_index(RSTR(self), offset_in_bytes);
+    if (self->encoding->single_byte_encoding) {
+	return offset_in_bytes;
+    }
+    else if (IS_UTF16_ENC(self->encoding)) {
+	return BYTES_TO_UCHARS(offset_in_bytes);
+    }
+
+    // Slow path: guess the character index from the byte offset (2 bytes/char).
+    long index_guess = start_index +
+	((offset_in_bytes - start_offset_in_bytes) / 2);
+    while (true) {
+	character_boundaries_t boundaries = str_get_character_boundaries(self,
+		index_guess, local_cache);
+	assert(boundaries.start_offset_in_bytes <= offset_in_bytes);
+	if (boundaries.start_offset_in_bytes == offset_in_bytes) {
+	    break;
+	}
+	long new_guess = (offset_in_bytes
+		- boundaries.start_offset_in_bytes) / 2;
+	index_guess = new_guess > index_guess ? new_guess : index_guess + 1;
+    }
+    return index_guess;
 }
 
+static long
+str_index_for_string(rb_str_t *self, rb_str_t *searched, long start_index,
+	long end_index, bool backward_search)
+{
+    character_boundaries_cache_t local_cache;
+    reset_character_boundaries_cache(&local_cache);
+    return str_index_for_string_with_cache(self, searched, start_index,
+	    end_index, backward_search, &local_cache);
+}
+
 static bool
 str_include_string(rb_str_t *self, rb_str_t *searched)
 {
@@ -3155,7 +3164,9 @@
 static VALUE
 rstr_split(VALUE str, SEL sel, int argc, VALUE *argv)
 {
-    const long len = str_length(RSTR(str));
+    character_boundaries_cache_t local_cache;
+    reset_character_boundaries_cache(&local_cache);
+    const long len = str_length_with_cache(RSTR(str), &local_cache);
     int lim = 0;
 
     VALUE spat, limit;
@@ -3208,7 +3219,8 @@
 	for (long i = 0; i < chars_len; i++) {
 	    UChar c = chars[i];
 	    if (c == ' ' || c == '\t' || c == '\n' || c == '\v') {
-		VALUE substr = rstr_substr(str, beg, i - beg);
+		VALUE substr = rstr_substr_with_cache(str, beg, i - beg,
+			&local_cache);
 		str_strip(substr, 0);
 		if (rb_str_chars_len(substr) > 0) {
 		    rb_ary_push(result, substr); 
@@ -3224,7 +3236,7 @@
     else if (spat_string) {
 	if (spat_len == 0) {
 	    do {
-		VALUE substr = rstr_substr(str, beg, 1);
+		VALUE substr = rstr_substr_with_cache(str, beg, 1, &local_cache);
 		rb_ary_push(result, substr);
 		beg++;
 		if (beg >= len) {
@@ -3237,12 +3249,13 @@
 	    rb_str_t *spat_str = str_need_string(spat);
 	    const long spat_len = str_length(spat_str);
 	    do {
-		const long pos = str_index_for_string(RSTR(str), spat_str,
-			beg, -1, false);
+		const long pos = str_index_for_string_with_cache(RSTR(str),
+			spat_str, beg, -1, false, &local_cache);
 		if (pos == -1) {
 		    break;
 		}
-		rb_ary_push(result, rstr_substr(str, beg, pos - beg));
+		rb_ary_push(result, rstr_substr_with_cache(str, beg, pos - beg,
+			    &local_cache));
 		beg = pos + spat_len;
 	    }
 	    while (limit == Qnil || --lim > 1);
@@ -3268,7 +3281,8 @@
 		if (last_null) {
 		    VALUE substr;
 		    if (beg + 1 <= len) {
-			substr = rstr_substr(str, beg, 1);
+			substr = rstr_substr_with_cache(str, beg, 1,
+				&local_cache);
 		    }
 		    else {
 			substr = rb_str_new(NULL, 0);
@@ -3283,7 +3297,9 @@
 		}
 	    }
 	    else {
-		rb_ary_push(result, rstr_substr(str, beg, pos - beg));
+		VALUE substr = rstr_substr_with_cache(str, beg, pos - beg,
+			&local_cache);
+		rb_ary_push(result, substr);
 		beg = start = results[0].end;
 	    }
 	    last_null = false;
@@ -3297,8 +3313,8 @@
 		    substr = rb_str_new(NULL, 0);
 		}
 		else {
-		    substr = rstr_substr(str, results[i].beg,
-			    results[i].end - results[i].beg);
+		    substr = rstr_substr_with_cache(str, results[i].beg,
+			    results[i].end - results[i].beg, &local_cache);
 		}
 		rb_ary_push(result, substr);
 	    }
@@ -4671,9 +4687,9 @@
     long pos = 0;
     do {
 	long off = str_index_for_string(RSTR(str), rs_str, pos, -1, false);
-	if(paragraph && off >= 0) {
+	if (paragraph && off >= 0) {
 	    int i;
-	    for(i = off + 1; i < len; i++) {
+	    for (i = off + 1; i < len; i++) {
 		UChar c = str_get_uchar(RSTR(str), i);
 		if (c != '\n') {
 		    break;

Modified: MacRuby/trunk/ucnv.c
===================================================================
--- MacRuby/trunk/ucnv.c	2011-01-18 00:18:59 UTC (rev 5179)
+++ MacRuby/trunk/ucnv.c	2011-01-18 01:55:01 UTC (rev 5180)
@@ -272,61 +272,6 @@
     return boundaries;
 }
 
-long
-str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes,
-	bool ucs2_mode)
-{
-    // the code has many similarities with str_length
-    USE_CONVERTER(cnv, self->encoding);
-
-    const char *current_position = self->bytes;
-    const char *searched_position = current_position + offset_in_bytes;
-    const char *end = current_position + self->length_in_bytes;
-    long index = 0;
-    for (;;) {
-	const char *character_start_position = current_position;
-	// iterate through the string one Unicode code point at a time
-	UErrorCode err = U_ZERO_ERROR;
-	UChar32 c = ucnv_getNextUChar(cnv, &current_position, end, &err);
-	if (err == U_INDEX_OUTOFBOUNDS_ERROR) {
-	    // end of the string
-	    // should not happen because str_offset_in_bytes_to_index
-	    // checks before that offset_in_bytes is inferior to the length
-	    // in bytes
-	    abort();
-	}
-	else if (U_FAILURE(err)) {
-	    long min_char_size = self->encoding->min_char_size;
-	    long converted_width = current_position - character_start_position;
-	    long to_add = div_round_up(converted_width, min_char_size);
-	    if (searched_position < character_start_position + to_add) {
-		long difference = searched_position - character_start_position;
-		index += (difference / min_char_size);
-		break;
-	    }
-	    index += to_add;
-	}
-	else {
-	    if (searched_position < current_position) {
-		break;
-	    }
-	    if (ucs2_mode && !U_IS_BMP(c)) {
-		index += 2;
-	    }
-	    else {
-		++index;
-	    }
-	}
-	if (searched_position == current_position) {
-	    break;
-	}
-    }
-
-    ucnv_close(cnv);
-
-    return index;
-}
-
 void
 str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc,
 	rb_str_t *self, long *pos,
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20110117/efcb2d69/attachment-0001.html>