[macruby-changes] [5180] MacRuby/trunk
source_changes at macosforge.org
source_changes at macosforge.org
Mon Jan 17 17:55:06 PST 2011
Revision: 5180
http://trac.macosforge.org/projects/ruby/changeset/5180
Author: lsansonetti at apple.com
Date: 2011-01-17 17:55:01 -0800 (Mon, 17 Jan 2011)
Log Message:
-----------
String#split should be faster on UTF-8 strings containing multibyte characters
Modified Paths:
--------------
MacRuby/trunk/encoding_ucnv.h
MacRuby/trunk/string.c
MacRuby/trunk/ucnv.c
Modified: MacRuby/trunk/encoding_ucnv.h
===================================================================
--- MacRuby/trunk/encoding_ucnv.h 2011-01-18 00:18:59 UTC (rev 5179)
+++ MacRuby/trunk/encoding_ucnv.h 2011-01-18 01:55:01 UTC (rev 5180)
@@ -23,7 +23,6 @@
void str_ucnv_update_flags(rb_str_t *self);
long str_ucnv_length(rb_str_t *self, bool ucs2_mode);
character_boundaries_t str_ucnv_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode);
-long str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes, bool ucs2_mode);
void str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc, rb_str_t *self, long *pos, UChar **utf16, long *utf16_length);
void str_ucnv_transcode_from_utf16(struct rb_encoding *dst_enc, UChar *utf16, long utf16_length, long *utf16_pos, char **bytes, long *bytes_length);
void str_ucnv_each_uchar32(rb_str_t *self, each_uchar32_callback_t callback);
Modified: MacRuby/trunk/string.c
===================================================================
--- MacRuby/trunk/string.c 2011-01-18 00:18:59 UTC (rev 5179)
+++ MacRuby/trunk/string.c 2011-01-18 01:55:01 UTC (rev 5180)
@@ -574,7 +574,8 @@
}
static character_boundaries_t
-str_get_character_boundaries(rb_str_t *self, long index, character_boundaries_cache_t *cache)
+str_get_character_boundaries(rb_str_t *self, long index,
+ character_boundaries_cache_t *cache)
{
character_boundaries_t boundaries = {-1, -1};
@@ -1030,29 +1031,7 @@
return self->length_in_bytes > str->length_in_bytes ? 1 : -1;
}
-
static long
-str_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes)
-{
- if ((offset_in_bytes >= self->length_in_bytes) || (offset_in_bytes < 0)) {
- return -1;
- }
- if (offset_in_bytes == 0) {
- return 0;
- }
-
- if (self->encoding->single_byte_encoding) {
- return offset_in_bytes;
- }
- else if (IS_UTF16_ENC(self->encoding)) {
- return BYTES_TO_UCHARS(offset_in_bytes);
- }
- else {
- return str_ucnv_offset_in_bytes_to_index(self, offset_in_bytes, true);
- }
-}
-
-static long
str_offset_in_bytes_for_string(rb_str_t *self, rb_str_t *searched,
long start_offset_in_bytes, long end_offset_in_bytes,
bool backward_search)
@@ -1071,8 +1050,7 @@
return -1;
}
- long increment = self->encoding->min_char_size;
-
+ const long increment = self->encoding->min_char_size;
if (backward_search) {
for (long offset = end_offset_in_bytes - increment;
offset >= start_offset_in_bytes;
@@ -1100,8 +1078,9 @@
}
static long
-str_index_for_string(rb_str_t *self, rb_str_t *searched, long start_index,
- long end_index, bool backward_search)
+str_index_for_string_with_cache(rb_str_t *self, rb_str_t *searched,
+ long start_index, long end_index, bool backward_search,
+ character_boundaries_cache_t *local_cache)
{
str_must_have_compatible_encoding(self, searched);
@@ -1109,16 +1088,13 @@
return start_index;
}
- character_boundaries_cache_t local_cache;
- reset_character_boundaries_cache(&local_cache);
-
long start_offset_in_bytes;
if (start_index == 0) {
start_offset_in_bytes = 0;
}
else {
character_boundaries_t boundaries = str_get_character_boundaries(self,
- start_index, &local_cache);
+ start_index, local_cache);
if (boundaries.start_offset_in_bytes == -1) {
if (boundaries.end_offset_in_bytes == -1) {
return -1;
@@ -1132,12 +1108,13 @@
}
long end_offset_in_bytes;
- if (end_index < 0 || end_index == str_length(self)) {
+ if (end_index < 0
+ || end_index == str_length_with_cache(self, local_cache)) {
end_offset_in_bytes = self->length_in_bytes;
}
else {
character_boundaries_t boundaries = str_get_character_boundaries(self,
- end_index, &local_cache);
+ end_index, local_cache);
if (boundaries.start_offset_in_bytes == -1) {
if (boundaries.end_offset_in_bytes == -1) {
return -1;
@@ -1153,12 +1130,44 @@
const long offset_in_bytes = str_offset_in_bytes_for_string(self,
searched, start_offset_in_bytes, end_offset_in_bytes,
backward_search);
- if (offset_in_bytes == -1) {
+
+ if (offset_in_bytes < 0 || offset_in_bytes >= self->length_in_bytes) {
return -1;
}
- return str_offset_in_bytes_to_index(RSTR(self), offset_in_bytes);
+ if (self->encoding->single_byte_encoding) {
+ return offset_in_bytes;
+ }
+ else if (IS_UTF16_ENC(self->encoding)) {
+ return BYTES_TO_UCHARS(offset_in_bytes);
+ }
+
+ // Slow path: convert the bytes index to a character index, by guessing.
+ long index_guess = start_index +
+ ((offset_in_bytes - start_offset_in_bytes) / 2);
+ while (true) {
+ character_boundaries_t boundaries = str_get_character_boundaries(self,
+ index_guess, local_cache);
+ assert(boundaries.start_offset_in_bytes <= offset_in_bytes);
+ if (boundaries.start_offset_in_bytes == offset_in_bytes) {
+ break;
+ }
+ long new_guess = (offset_in_bytes
+ - boundaries.start_offset_in_bytes) / 2;
+ index_guess = new_guess > index_guess ? new_guess : index_guess + 1;
+ }
+ return index_guess;
}
+static long
+str_index_for_string(rb_str_t *self, rb_str_t *searched, long start_index,
+ long end_index, bool backward_search)
+{
+ character_boundaries_cache_t local_cache;
+ reset_character_boundaries_cache(&local_cache);
+ return str_index_for_string_with_cache(self, searched, start_index,
+ end_index, backward_search, &local_cache);
+}
+
static bool
str_include_string(rb_str_t *self, rb_str_t *searched)
{
@@ -3155,7 +3164,9 @@
static VALUE
rstr_split(VALUE str, SEL sel, int argc, VALUE *argv)
{
- const long len = str_length(RSTR(str));
+ character_boundaries_cache_t local_cache;
+ reset_character_boundaries_cache(&local_cache);
+ const long len = str_length_with_cache(RSTR(str), &local_cache);
int lim = 0;
VALUE spat, limit;
@@ -3208,7 +3219,8 @@
for (long i = 0; i < chars_len; i++) {
UChar c = chars[i];
if (c == ' ' || c == '\t' || c == '\n' || c == '\v') {
- VALUE substr = rstr_substr(str, beg, i - beg);
+ VALUE substr = rstr_substr_with_cache(str, beg, i - beg,
+ &local_cache);
str_strip(substr, 0);
if (rb_str_chars_len(substr) > 0) {
rb_ary_push(result, substr);
@@ -3224,7 +3236,7 @@
else if (spat_string) {
if (spat_len == 0) {
do {
- VALUE substr = rstr_substr(str, beg, 1);
+ VALUE substr = rstr_substr_with_cache(str, beg, 1, &local_cache);
rb_ary_push(result, substr);
beg++;
if (beg >= len) {
@@ -3237,12 +3249,13 @@
rb_str_t *spat_str = str_need_string(spat);
const long spat_len = str_length(spat_str);
do {
- const long pos = str_index_for_string(RSTR(str), spat_str,
- beg, -1, false);
+ const long pos = str_index_for_string_with_cache(RSTR(str),
+ spat_str, beg, -1, false, &local_cache);
if (pos == -1) {
break;
}
- rb_ary_push(result, rstr_substr(str, beg, pos - beg));
+ rb_ary_push(result, rstr_substr_with_cache(str, beg, pos - beg,
+ &local_cache));
beg = pos + spat_len;
}
while (limit == Qnil || --lim > 1);
@@ -3268,7 +3281,8 @@
if (last_null) {
VALUE substr;
if (beg + 1 <= len) {
- substr = rstr_substr(str, beg, 1);
+ substr = rstr_substr_with_cache(str, beg, 1,
+ &local_cache);
}
else {
substr = rb_str_new(NULL, 0);
@@ -3283,7 +3297,9 @@
}
}
else {
- rb_ary_push(result, rstr_substr(str, beg, pos - beg));
+ VALUE substr = rstr_substr_with_cache(str, beg, pos - beg,
+ &local_cache);
+ rb_ary_push(result, substr);
beg = start = results[0].end;
}
last_null = false;
@@ -3297,8 +3313,8 @@
substr = rb_str_new(NULL, 0);
}
else {
- substr = rstr_substr(str, results[i].beg,
- results[i].end - results[i].beg);
+ substr = rstr_substr_with_cache(str, results[i].beg,
+ results[i].end - results[i].beg, &local_cache);
}
rb_ary_push(result, substr);
}
@@ -4671,9 +4687,9 @@
long pos = 0;
do {
long off = str_index_for_string(RSTR(str), rs_str, pos, -1, false);
- if(paragraph && off >= 0) {
+ if (paragraph && off >= 0) {
int i;
- for(i = off + 1; i < len; i++) {
+ for (i = off + 1; i < len; i++) {
UChar c = str_get_uchar(RSTR(str), i);
if (c != '\n') {
break;
Modified: MacRuby/trunk/ucnv.c
===================================================================
--- MacRuby/trunk/ucnv.c 2011-01-18 00:18:59 UTC (rev 5179)
+++ MacRuby/trunk/ucnv.c 2011-01-18 01:55:01 UTC (rev 5180)
@@ -272,61 +272,6 @@
return boundaries;
}
-long
-str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes,
- bool ucs2_mode)
-{
- // the code has many similarities with str_length
- USE_CONVERTER(cnv, self->encoding);
-
- const char *current_position = self->bytes;
- const char *searched_position = current_position + offset_in_bytes;
- const char *end = current_position + self->length_in_bytes;
- long index = 0;
- for (;;) {
- const char *character_start_position = current_position;
- // iterate through the string one Unicode code point at a time
- UErrorCode err = U_ZERO_ERROR;
- UChar32 c = ucnv_getNextUChar(cnv, &current_position, end, &err);
- if (err == U_INDEX_OUTOFBOUNDS_ERROR) {
- // end of the string
- // should not happen because str_offset_in_bytes_to_index
- // checks before that offset_in_bytes is inferior to the length
- // in bytes
- abort();
- }
- else if (U_FAILURE(err)) {
- long min_char_size = self->encoding->min_char_size;
- long converted_width = current_position - character_start_position;
- long to_add = div_round_up(converted_width, min_char_size);
- if (searched_position < character_start_position + to_add) {
- long difference = searched_position - character_start_position;
- index += (difference / min_char_size);
- break;
- }
- index += to_add;
- }
- else {
- if (searched_position < current_position) {
- break;
- }
- if (ucs2_mode && !U_IS_BMP(c)) {
- index += 2;
- }
- else {
- ++index;
- }
- }
- if (searched_position == current_position) {
- break;
- }
- }
-
- ucnv_close(cnv);
-
- return index;
-}
-
void
str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc,
rb_str_t *self, long *pos,
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20110117/efcb2d69/attachment-0001.html>
More information about the macruby-changes
mailing list