[macruby-changes] [5200] MacRuby/trunk
source_changes at macosforge.org
source_changes at macosforge.org
Thu Jan 27 20:48:39 PST 2011
Revision: 5200
http://trac.macosforge.org/projects/ruby/changeset/5200
Author: vincent.isambart at gmail.com
Date: 2011-01-27 20:48:38 -0800 (Thu, 27 Jan 2011)
Log Message:
-----------
String#rindex should work better (even though it may be a bit slower in some cases)
Modified Paths:
--------------
MacRuby/trunk/encoding_ucnv.h
MacRuby/trunk/string.c
MacRuby/trunk/ucnv.c
Modified: MacRuby/trunk/encoding_ucnv.h
===================================================================
--- MacRuby/trunk/encoding_ucnv.h 2011-01-28 04:48:31 UTC (rev 5199)
+++ MacRuby/trunk/encoding_ucnv.h 2011-01-28 04:48:38 UTC (rev 5200)
@@ -23,6 +23,7 @@
void str_ucnv_update_flags(rb_str_t *self);
long str_ucnv_length(rb_str_t *self, bool ucs2_mode);
character_boundaries_t str_ucnv_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode);
+long str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes, bool ucs2_mode);
void str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc, rb_str_t *self, long *pos, UChar **utf16, long *utf16_length);
void str_ucnv_transcode_from_utf16(struct rb_encoding *dst_enc, UChar *utf16, long utf16_length, long *utf16_pos, char **bytes, long *bytes_length);
void str_ucnv_each_uchar32_starting_from(rb_str_t *self,
Modified: MacRuby/trunk/string.c
===================================================================
--- MacRuby/trunk/string.c 2011-01-28 04:48:31 UTC (rev 5199)
+++ MacRuby/trunk/string.c 2011-01-28 04:48:38 UTC (rev 5200)
@@ -1050,49 +1050,29 @@
}
static long
-str_offset_in_bytes_for_string(rb_str_t *self, rb_str_t *searched,
- long start_offset_in_bytes, long end_offset_in_bytes,
- bool backward_search)
+str_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes)
{
- if (start_offset_in_bytes >= self->length_in_bytes) {
+ if ((offset_in_bytes >= self->length_in_bytes) || (offset_in_bytes < 0)) {
return -1;
}
- if (self == searched && start_offset_in_bytes == 0) {
+ if (offset_in_bytes == 0) {
return 0;
}
- if (searched->length_in_bytes == 0) {
- return backward_search ? end_offset_in_bytes : start_offset_in_bytes;
+
+ if (self->encoding->single_byte_encoding
+ || (self->encoding->ascii_compatible && str_is_ascii_only(self))) {
+ return offset_in_bytes;
}
- str_must_have_compatible_encoding(self, searched);
- if (searched->length_in_bytes > self->length_in_bytes) {
- return -1;
- }
-
- const long increment = self->encoding->min_char_size;
- if (backward_search) {
- for (long offset = end_offset_in_bytes - increment;
- offset >= start_offset_in_bytes;
- offset -= increment) {
- if (memcmp(self->bytes + offset, searched->bytes,
- searched->length_in_bytes) == 0) {
- return offset;
- }
+ else if (IS_UTF16_ENC(self->encoding)) {
+ if (!ODD_NUMBER(offset_in_bytes)) {
+ // if we are in the middle of a character, there's no valid index
+ return -1;
}
+ return BYTES_TO_UCHARS(offset_in_bytes);
}
else {
- const long max_offset_in_bytes = end_offset_in_bytes
- - searched->length_in_bytes + 1;
-
- for (long offset = start_offset_in_bytes;
- offset < max_offset_in_bytes;
- offset += increment) {
- if (memcmp(self->bytes + offset, searched->bytes,
- searched->length_in_bytes) == 0) {
- return offset;
- }
- }
+ return str_ucnv_offset_in_bytes_to_index(self, offset_in_bytes, true);
}
- return -1;
}
static long
@@ -1105,6 +1085,9 @@
if (searched->length_in_bytes == 0 && self->length_in_bytes == 0) {
return start_index;
}
+ if (searched->length_in_bytes > self->length_in_bytes) {
+ return -1;
+ }
long start_offset_in_bytes;
if (start_index == 0) {
@@ -1125,10 +1108,23 @@
start_offset_in_bytes = boundaries.start_offset_in_bytes;
}
- long end_offset_in_bytes;
+ if (self == searched) {
+ if (start_offset_in_bytes == 0) {
+ return 0;
+ }
+ else {
+ return -1;
+ }
+ }
+
+ if (start_offset_in_bytes >= self->length_in_bytes) {
+ return -1;
+ }
+
+ long last_offset_in_bytes;
if (end_index < 0
|| end_index == str_length_with_cache(self, cache)) {
- end_offset_in_bytes = self->length_in_bytes;
+ last_offset_in_bytes = self->length_in_bytes;
}
else {
character_boundaries_t boundaries = str_get_character_boundaries(self,
@@ -1142,15 +1138,23 @@
str_cannot_cut_surrogate();
}
}
- end_offset_in_bytes = boundaries.end_offset_in_bytes;
+ last_offset_in_bytes = boundaries.start_offset_in_bytes;
}
+ long min_end_offset = self->length_in_bytes - searched->length_in_bytes;
+ if (last_offset_in_bytes > min_end_offset) {
+ last_offset_in_bytes = min_end_offset;
+ }
if (!backward_search) {
+ if (searched->length_in_bytes == 0) {
+ assert(start_index >= 0);
+ return start_index;
+ }
__block long returned_index = -1;
__block long current_index = start_index;
str_each_uchar32_starting_from(self, start_offset_in_bytes,
^(UChar32 c, long character_start_offset, long char_len, bool *stop) {
- if (end_offset_in_bytes - character_start_offset < searched->length_in_bytes) {
+ if (character_start_offset > last_offset_in_bytes) {
// not enough characters left: we could not find the string
*stop = true;
return;
@@ -1166,36 +1170,40 @@
return returned_index;
}
- const long offset_in_bytes = str_offset_in_bytes_for_string(self,
- searched, start_offset_in_bytes, end_offset_in_bytes,
- backward_search);
-
- if (offset_in_bytes < 0 || offset_in_bytes >= self->length_in_bytes) {
- return -1;
+ // backward search
+ if (searched->length_in_bytes == 0) {
+ if (end_index < 0) {
+ return str_length_with_cache(self, cache);
+ }
+ else {
+ return end_index;
+ }
}
- if (self->encoding->single_byte_encoding
- || (self->encoding->ascii_compatible && str_is_ascii_only(self))) {
- return offset_in_bytes;
- }
- else if (IS_UTF16_ENC(self->encoding)) {
- return BYTES_TO_UCHARS(offset_in_bytes);
- }
- // Slow path: convert the bytes index to a character index, by guessing.
- long index_guess = start_index +
- ((offset_in_bytes - start_offset_in_bytes) / 2);
- while (true) {
- character_boundaries_t boundaries = str_get_character_boundaries(self,
- index_guess, cache);
- assert(boundaries.start_offset_in_bytes <= offset_in_bytes);
- if (boundaries.start_offset_in_bytes == offset_in_bytes) {
- break;
+ for (;;) {
+ long offset_found = -1;
+ for (long offset = last_offset_in_bytes;
+ offset >= start_offset_in_bytes;
+ --offset) {
+ if (memcmp(self->bytes + offset, searched->bytes,
+ searched->length_in_bytes) == 0) {
+ offset_found = offset;
+ break;
+ }
}
- long new_guess = (offset_in_bytes
- - boundaries.start_offset_in_bytes) / 2;
- index_guess = new_guess > index_guess ? new_guess : index_guess + 1;
+ if (offset_found < 0) {
+ // not found
+ return -1;
+ }
+
+ long index = str_offset_in_bytes_to_index(RSTR(self), offset_found);
+ if (index != -1) {
+ // the offset was valid, at the start of a character
+ return index;
+ }
+
+ last_offset_in_bytes = offset_found - 1;
}
- return index_guess;
}
static long
Modified: MacRuby/trunk/ucnv.c
===================================================================
--- MacRuby/trunk/ucnv.c 2011-01-28 04:48:31 UTC (rev 5199)
+++ MacRuby/trunk/ucnv.c 2011-01-28 04:48:38 UTC (rev 5200)
@@ -274,6 +274,64 @@
return boundaries;
}
+long
+str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes,
+ bool ucs2_mode)
+{
+ // the code has many similarities with str_length
+ USE_CONVERTER(cnv, self->encoding);
+
+ const char *current_position = self->bytes;
+ const char *searched_position = current_position + offset_in_bytes;
+ const char *end = current_position + self->length_in_bytes;
+ long index = 0;
+ for (;;) {
+ const char *character_start_position = current_position;
+ // iterate through the string one Unicode code point at a time
+ UErrorCode err = U_ZERO_ERROR;
+ UChar32 c = ucnv_getNextUChar(cnv, &current_position, end, &err);
+ if (err == U_INDEX_OUTOFBOUNDS_ERROR) {
+ // end of the string
+ // should not happen because str_offset_in_bytes_to_index
+ // checks before that offset_in_bytes is inferior to the length
+ // in bytes
+ abort();
+ }
+ else if (U_FAILURE(err)) {
+ long min_char_size = self->encoding->min_char_size;
+ long converted_width = current_position - character_start_position;
+ long to_add = div_round_up(converted_width, min_char_size);
+ if (searched_position < character_start_position + to_add) {
+ long difference = searched_position - character_start_position;
+ index += (difference / min_char_size);
+ break;
+ }
+ index += to_add;
+ }
+ else {
+ if (searched_position < current_position) {
+ // if we are in the middle of a character
+ // there is no valid index
+ index = -1;
+ break;
+ }
+ if (ucs2_mode && !U_IS_BMP(c)) {
+ index += 2;
+ }
+ else {
+ ++index;
+ }
+ }
+ if (searched_position == current_position) {
+ break;
+ }
+ }
+
+ ucnv_close(cnv);
+
+ return index;
+}
+
void
str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc,
rb_str_t *self, long *pos,
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20110127/66cd53e9/attachment-0001.html>
More information about the macruby-changes
mailing list