[macruby-changes] [5200] MacRuby/trunk

Thu Jan 27 20:48:39 PST 2011

Revision: 5200
          http://trac.macosforge.org/projects/ruby/changeset/5200
Author:   vincent.isambart at gmail.com
Date:     2011-01-27 20:48:38 -0800 (Thu, 27 Jan 2011)
Log Message:
-----------
String#rindex should work better (even though it may be a bit slower in some cases)

Modified Paths:
--------------
    MacRuby/trunk/encoding_ucnv.h
    MacRuby/trunk/string.c
    MacRuby/trunk/ucnv.c

Modified: MacRuby/trunk/encoding_ucnv.h
===================================================================

--- MacRuby/trunk/encoding_ucnv.h	2011-01-28 04:48:31 UTC (rev 5199)
+++ MacRuby/trunk/encoding_ucnv.h	2011-01-28 04:48:38 UTC (rev 5200)
@@ -23,6 +23,7 @@
 void str_ucnv_update_flags(rb_str_t *self);
 long str_ucnv_length(rb_str_t *self, bool ucs2_mode);
 character_boundaries_t str_ucnv_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode);
+long str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes, bool ucs2_mode);
 void str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc, rb_str_t *self, long *pos, UChar **utf16, long *utf16_length);
 void str_ucnv_transcode_from_utf16(struct rb_encoding *dst_enc, UChar *utf16, long utf16_length, long *utf16_pos, char **bytes, long *bytes_length);
 void str_ucnv_each_uchar32_starting_from(rb_str_t *self,

Modified: MacRuby/trunk/string.c
===================================================================
--- MacRuby/trunk/string.c	2011-01-28 04:48:31 UTC (rev 5199)
+++ MacRuby/trunk/string.c	2011-01-28 04:48:38 UTC (rev 5200)
@@ -1050,49 +1050,29 @@
 }
 
 static long
-str_offset_in_bytes_for_string(rb_str_t *self, rb_str_t *searched,
-	long start_offset_in_bytes, long end_offset_in_bytes,
-	bool backward_search)
+str_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes)
 {
-    if (start_offset_in_bytes >= self->length_in_bytes) {
+    if ((offset_in_bytes >= self->length_in_bytes) || (offset_in_bytes < 0)) {
 	return -1;
     }
-    if (self == searched && start_offset_in_bytes == 0) {
+    if (offset_in_bytes == 0) {
 	return 0;
     }
-    if (searched->length_in_bytes == 0) {
-	return backward_search ? end_offset_in_bytes : start_offset_in_bytes;
+    // converts a byte offset to a character index; -1 means "no valid index"
+    if (self->encoding->single_byte_encoding
+	    || (self->encoding->ascii_compatible && str_is_ascii_only(self))) {
+	return offset_in_bytes;
     }
-    str_must_have_compatible_encoding(self, searched);
-    if (searched->length_in_bytes > self->length_in_bytes) {
-	return -1;
-    }
-
-    const long increment = self->encoding->min_char_size;
-    if (backward_search) {
-	for (long offset = end_offset_in_bytes - increment;
-		offset >= start_offset_in_bytes;
-		offset -= increment) {
-	    if (memcmp(self->bytes + offset, searched->bytes,
-			searched->length_in_bytes) == 0) {
-		return offset;
-	    }
+    else if (IS_UTF16_ENC(self->encoding)) {
+	if (ODD_NUMBER(offset_in_bytes)) {
+	    // an odd byte offset is in the middle of a 16-bit unit: no valid index
+	    return -1;
 	}
+	return BYTES_TO_UCHARS(offset_in_bytes);
     }
     else {
-	const long max_offset_in_bytes = end_offset_in_bytes
-	    - searched->length_in_bytes + 1;
-
-	for (long offset = start_offset_in_bytes;
-		offset < max_offset_in_bytes;
-		offset += increment) {
-	    if (memcmp(self->bytes + offset, searched->bytes,
-			searched->length_in_bytes) == 0) {
-		return offset;
-	    }
-	}
+	return str_ucnv_offset_in_bytes_to_index(self, offset_in_bytes, true);
     }
-    return -1;
 }
 
 static long
@@ -1105,6 +1085,9 @@
     if (searched->length_in_bytes == 0 && self->length_in_bytes == 0) {
 	return start_index;
     }
+    if (searched->length_in_bytes > self->length_in_bytes) {
+	return -1;
+    }
 
     long start_offset_in_bytes;
     if (start_index == 0) {
@@ -1125,10 +1108,23 @@
 	start_offset_in_bytes = boundaries.start_offset_in_bytes;
     }
 
-    long end_offset_in_bytes;
+    if (self == searched) {
+	if (start_offset_in_bytes == 0) {
+	    return 0;
+	}
+	else {
+	    return -1;
+	}
+    }
+
+    if (start_offset_in_bytes >= self->length_in_bytes) {
+	return -1;
+    }
+
+    long last_offset_in_bytes;
     if (end_index < 0
 	    || end_index == str_length_with_cache(self, cache)) {
-	end_offset_in_bytes = self->length_in_bytes;
+	last_offset_in_bytes = self->length_in_bytes;
     }
     else {
 	character_boundaries_t boundaries = str_get_character_boundaries(self,
@@ -1142,15 +1138,23 @@
 		str_cannot_cut_surrogate();
 	    }
 	}
-	end_offset_in_bytes = boundaries.end_offset_in_bytes;
+	last_offset_in_bytes = boundaries.start_offset_in_bytes;
     }
+    long min_end_offset = self->length_in_bytes - searched->length_in_bytes;
+    if (last_offset_in_bytes > min_end_offset) {
+	last_offset_in_bytes = min_end_offset;
+    }
 
     if (!backward_search) {
+	if (searched->length_in_bytes == 0) {
+	    assert(start_index >= 0);
+	    return start_index;
+	}
 	__block long returned_index = -1;
 	__block long current_index = start_index;
 	str_each_uchar32_starting_from(self, start_offset_in_bytes,
 		^(UChar32 c, long character_start_offset, long char_len, bool *stop) {
-	    if (end_offset_in_bytes - character_start_offset < searched->length_in_bytes) {
+	    if (character_start_offset > last_offset_in_bytes) {
 		// not enough characters left: we could not find the string
 		*stop = true;
 		return;
@@ -1166,36 +1170,40 @@
 	return returned_index;
     }
 
-    const long offset_in_bytes = str_offset_in_bytes_for_string(self,
-	    searched, start_offset_in_bytes, end_offset_in_bytes,
-	    backward_search);
-
-    if (offset_in_bytes < 0 || offset_in_bytes >= self->length_in_bytes) {
-	return -1;
+    // backward search
+    if (searched->length_in_bytes == 0) {
+	if (end_index < 0) {
+	    return str_length_with_cache(self, cache);
+	}
+	else {
+	    return end_index;
+	}
     }
-    if (self->encoding->single_byte_encoding
-	    || (self->encoding->ascii_compatible && str_is_ascii_only(self))) {
-	return offset_in_bytes;
-    }
-    else if (IS_UTF16_ENC(self->encoding)) {
-	return BYTES_TO_UCHARS(offset_in_bytes);
-    }
 
-    // Slow path: convert the bytes index to a character index, by guessing.
-    long index_guess = start_index +
-	((offset_in_bytes - start_offset_in_bytes) / 2);
-    while (true) {
-	character_boundaries_t boundaries = str_get_character_boundaries(self,
-		index_guess, cache);
-	assert(boundaries.start_offset_in_bytes <= offset_in_bytes);
-	if (boundaries.start_offset_in_bytes == offset_in_bytes) {
-	    break;
+    for (;;) {
+	long offset_found = -1;
+	for (long offset = last_offset_in_bytes;
+		offset >= start_offset_in_bytes;
+		--offset) {
+	    if (memcmp(self->bytes + offset, searched->bytes,
+			searched->length_in_bytes) == 0) {
+		offset_found = offset;
+		break;
+	    }
 	}
-	long new_guess = (offset_in_bytes
-		- boundaries.start_offset_in_bytes) / 2;
-	index_guess = new_guess > index_guess ? new_guess : index_guess + 1;
+	if (offset_found < 0) {
+	    // not found
+	    return -1;
+	}
+
+	long index = str_offset_in_bytes_to_index(RSTR(self), offset_found);
+	if (index != -1) {
+	    // the offset was valid, at the start of a character
+	    return index;
+	}
+
+	last_offset_in_bytes = offset_found - 1;
     }
-    return index_guess;
 }
 
 static long

Modified: MacRuby/trunk/ucnv.c
===================================================================
--- MacRuby/trunk/ucnv.c	2011-01-28 04:48:31 UTC (rev 5199)
+++ MacRuby/trunk/ucnv.c	2011-01-28 04:48:38 UTC (rev 5200)
@@ -274,6 +274,64 @@
     return boundaries;
 }
 
+long
+str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes,
+	bool ucs2_mode)
+{
+    // returns the character index at offset_in_bytes, or -1 if mid-character
+    USE_CONVERTER(cnv, self->encoding);
+    // (the code has many similarities with str_length)
+    const char *current_position = self->bytes;
+    const char *searched_position = current_position + offset_in_bytes;
+    const char *end = current_position + self->length_in_bytes;
+    long index = 0;
+    for (;;) {
+	const char *character_start_position = current_position;
+	// iterate through the string one Unicode code point at a time
+	UErrorCode err = U_ZERO_ERROR;
+	UChar32 c = ucnv_getNextUChar(cnv, &current_position, end, &err);
+	if (err == U_INDEX_OUTOFBOUNDS_ERROR) {
+	    // end of the string
+	    // should not happen because str_offset_in_bytes_to_index
+	    // checks beforehand that offset_in_bytes is less than the
+	    // length in bytes
+	    abort();
+	}
+	else if (U_FAILURE(err)) {
+	    long min_char_size = self->encoding->min_char_size;
+	    long converted_width = current_position - character_start_position;
+	    long to_add = div_round_up(converted_width, min_char_size);
+	    if (searched_position < current_position) {
+		long difference = searched_position - character_start_position;
+		index = (difference % min_char_size) ? -1 : index + (difference / min_char_size);
+		break;
+	    }
+	    index += to_add;
+	}
+	else {
+	    if (searched_position < current_position) {
+		// if we are in the middle of a character
+		// there is no valid index
+		index = -1;
+		break;
+	    }
+	    if (ucs2_mode && !U_IS_BMP(c)) {
+		index += 2;
+	    }
+	    else {
+		++index;
+	    }
+	}
+	if (searched_position == current_position) {
+	    break;
+	}
+    }
+
+    ucnv_close(cnv);
+
+    return index;
+}
+
 void
 str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc,
 	rb_str_t *self, long *pos,
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20110127/66cd53e9/attachment-0001.html>