[macruby-changes] [5130] MacRuby/trunk
source_changes at macosforge.org
source_changes at macosforge.org
Fri Jan 7 05:24:07 PST 2011
Revision: 5130
http://trac.macosforge.org/projects/ruby/changeset/5130
Author: vincent.isambart at gmail.com
Date: 2011-01-07 05:24:02 -0800 (Fri, 07 Jan 2011)
Log Message:
-----------
String#scan should now be much faster for non-ASCII strings
rb_str_subseq should also now do what it should.
Next step: Make String#gsub faster
Modified Paths:
--------------
MacRuby/trunk/encoding.h
MacRuby/trunk/ext/iconv/iconv.c
MacRuby/trunk/re.c
MacRuby/trunk/string.c
MacRuby/trunk/ucnv.c
Modified: MacRuby/trunk/encoding.h
===================================================================
--- MacRuby/trunk/encoding.h 2011-01-07 04:13:16 UTC (rev 5129)
+++ MacRuby/trunk/encoding.h 2011-01-07 13:24:02 UTC (rev 5130)
@@ -109,6 +109,12 @@
long end_offset_in_bytes;
} character_boundaries_t;
+typedef struct {
+ character_boundaries_t cached_boundaries;
+ long cached_boundaries_index;
+ long cached_length;
+} character_boundaries_cache_t;
+
typedef struct rb_encoding {
struct RBasic basic;
unsigned int index;
@@ -169,6 +175,14 @@
#define ODD_NUMBER(x) ((x) & 0x1)
+static inline void
+reset_character_boundaries_cache(character_boundaries_cache_t *cache)
+{
+ assert(cache != NULL);
+ cache->cached_boundaries_index = -1;
+ cache->cached_length = -1;
+}
+
static inline long
div_round_up(long a, long b)
{
@@ -278,6 +292,11 @@
TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, NULL);
}
+VALUE rb_str_substr_with_cache(VALUE str, long beg, long len,
+ character_boundaries_cache_t *cache);
+VALUE rb_reg_nth_match_with_cache(int nth, VALUE match,
+ character_boundaries_cache_t *cache);
+
int rstr_compare(rb_str_t *str1, rb_str_t *str2);
void rb_str_NSCoder_encode(void *coder, VALUE str, const char *key);
Modified: MacRuby/trunk/ext/iconv/iconv.c
===================================================================
--- MacRuby/trunk/ext/iconv/iconv.c 2011-01-07 04:13:16 UTC (rev 5129)
+++ MacRuby/trunk/ext/iconv/iconv.c 2011-01-07 13:24:02 UTC (rev 5130)
@@ -941,7 +941,7 @@
if (NIL_P(n2) || length < 0) {
length = slen;
}
- str = rb_str_subseq(str, start, length);
+ str = rb_str_substr(str, start, length);
}
return iconv_convert(VALUE2ICONV(cd), str, 0, -1, rb_enc_get_index(self), NULL);
Modified: MacRuby/trunk/re.c
===================================================================
--- MacRuby/trunk/re.c 2011-01-07 04:13:16 UTC (rev 5129)
+++ MacRuby/trunk/re.c 2011-01-07 13:24:02 UTC (rev 5130)
@@ -1852,7 +1852,8 @@
}
VALUE
-rb_reg_nth_match(int nth, VALUE match)
+rb_reg_nth_match_with_cache(int nth, VALUE match,
+ character_boundaries_cache_t *cache)
{
if (NIL_P(match)) {
return Qnil;
@@ -1873,10 +1874,16 @@
return Qnil;
}
- return rb_str_substr(RMATCH(match)->str, beg, end - beg);
+ return rb_str_substr_with_cache(RMATCH(match)->str, beg, end - beg, cache);
}
VALUE
+rb_reg_nth_match(int nth, VALUE match)
+{
+ return rb_reg_nth_match_with_cache(nth, match, NULL);
+}
+
+VALUE
rb_reg_last_match(VALUE match)
{
return rb_reg_nth_match(0, match);
Modified: MacRuby/trunk/string.c
===================================================================
--- MacRuby/trunk/string.c 2011-01-07 04:13:16 UTC (rev 5129)
+++ MacRuby/trunk/string.c 2011-01-07 13:24:02 UTC (rev 5130)
@@ -379,14 +379,28 @@
}
static long
-str_length(rb_str_t *self)
+str_length_with_cache(rb_str_t *self, character_boundaries_cache_t *cache)
{
- if (self->encoding->single_byte_encoding
+ // fast paths
+ if (self->length_in_bytes == 0) {
+ return 0;
+ }
+ else if (self->encoding->single_byte_encoding
|| (self->encoding->ascii_compatible && str_is_ascii_only(self))) {
return self->length_in_bytes;
}
- else if (IS_UTF8_ENC(self->encoding)) {
- long length = 0;
+ else if (IS_UTF16_ENC(self->encoding)) {
+ return div_round_up(self->length_in_bytes, 2);
+ }
+
+ if (cache != NULL
+ && cache->cached_length >= 0) {
+ return cache->cached_length;
+ }
+
+ // slow paths
+ long length = 0;
+ if (IS_UTF8_ENC(self->encoding)) {
int i = 0;
while (i < self->length_in_bytes) {
UChar32 c;
@@ -402,16 +416,23 @@
length += 2;
}
}
- return length;
}
- else if (IS_UTF16_ENC(self->encoding)) {
- return div_round_up(self->length_in_bytes, 2);
- }
else {
- return str_ucnv_length(self, true);
+ length = str_ucnv_length(self, true);
}
+
+ if (cache != NULL) {
+ cache->cached_length = length;
+ }
+
+ return length;
}
+static long str_length(rb_str_t *self)
+{
+ return str_length_with_cache(self, NULL);
+}
+
// Note that each_uchar32 iterates on Unicode characters
// With a character not in the BMP the callback will only be called once!
static void
@@ -533,6 +554,7 @@
str_new_copy_of_part(rb_str_t *self, long offset_in_bytes,
long length_in_bytes)
{
+ assert(length_in_bytes > 0);
rb_str_t *str = str_alloc(rb_cRubyString);
str->encoding = self->encoding;
str->capacity_in_bytes = str->length_in_bytes = length_in_bytes;
@@ -552,10 +574,11 @@
}
static character_boundaries_t
-str_get_character_boundaries(rb_str_t *self, long index)
+str_get_character_boundaries(rb_str_t *self, long index, character_boundaries_cache_t *cache)
{
character_boundaries_t boundaries = {-1, -1};
+ // fast paths
if (self->encoding->single_byte_encoding
|| (self->encoding->ascii_compatible && str_is_ascii_only(self))) {
if (index < 0) {
@@ -566,74 +589,105 @@
}
boundaries.start_offset_in_bytes = index;
boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes + 1;
+
+ return boundaries; // getting the offset is fast so no use caching it
}
- else if (IS_UTF8_ENC(self->encoding)) {
- long pos = 0;
- int i = 0;
+ else if (IS_UTF16_ENC(self->encoding)) {
if (index < 0) {
- index += str_length(self);
+ index += div_round_up(self->length_in_bytes, 2);
if (index < 0) {
return boundaries;
}
}
- while (i < self->length_in_bytes) {
+ boundaries.start_offset_in_bytes = UCHARS_TO_BYTES(index);
+ boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes + 2;
+
+ return boundaries; // getting the offset is fast so no use caching it
+ }
+
+ // slow path
+ if (index < 0) {
+ index += str_length_with_cache(self, cache);
+ if (index < 0) {
+ return boundaries;
+ }
+ }
+
+ bool can_use_cache = (cache != NULL
+ && cache->cached_boundaries_index >= 0);
+ if (can_use_cache && cache->cached_boundaries_index == index) {
+ return cache->cached_boundaries;
+ }
+
+ if (IS_UTF8_ENC(self->encoding)) {
+ long pos = 0;
+ int index_in_bytes = 0;
+ if (can_use_cache && cache->cached_boundaries_index < index) {
+ // if we are in the middle of a non-BMP character,
+ // end_offset_in_bytes or start_offset_in_bytes might be -1
+ if (cache->cached_boundaries.end_offset_in_bytes == -1) {
+ index_in_bytes = cache->cached_boundaries.start_offset_in_bytes;
+ pos = cache->cached_boundaries_index;
+ }
+ else {
+ index_in_bytes = cache->cached_boundaries.end_offset_in_bytes;
+ pos = cache->cached_boundaries_index + 1;
+ }
+ }
+ while (index_in_bytes < self->length_in_bytes) {
UChar32 c;
- int old_i = i;
+ int old_index_in_bytes = index_in_bytes;
long new_pos = pos;
- U8_NEXT(self->bytes, i, self->length_in_bytes, c);
+ U8_NEXT(self->bytes, index_in_bytes, self->length_in_bytes, c);
if (c == U_SENTINEL) {
- new_pos += i - old_i;
+ new_pos += index_in_bytes - old_index_in_bytes;
if (new_pos > index) {
boundaries.start_offset_in_bytes =
- old_i + (index - pos);
+ old_index_in_bytes + (index - pos);
boundaries.end_offset_in_bytes =
boundaries.start_offset_in_bytes + 1;
- return boundaries;
+ break;
}
}
else if (U_IS_BMP(c)) {
new_pos++;
if (new_pos > index) {
- boundaries.start_offset_in_bytes = old_i;
- boundaries.end_offset_in_bytes = i;
- return boundaries;
+ boundaries.start_offset_in_bytes = old_index_in_bytes;
+ boundaries.end_offset_in_bytes = index_in_bytes;
+ break;
}
}
else {
new_pos += 2;
if (new_pos > index) {
if (index == pos) {
- boundaries.start_offset_in_bytes = old_i;
+ boundaries.start_offset_in_bytes = old_index_in_bytes;
}
else {
assert(index == pos + 1);
- boundaries.end_offset_in_bytes = i;
+ boundaries.end_offset_in_bytes = index_in_bytes;
}
- return boundaries;
+ break;
}
}
pos = new_pos;
}
}
- else if (IS_UTF16_ENC(self->encoding)) {
- if (index < 0) {
- index += div_round_up(self->length_in_bytes, 2);
- if (index < 0) {
- return boundaries;
- }
- }
- boundaries.start_offset_in_bytes = UCHARS_TO_BYTES(index);
- boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes + 2;
- }
else {
boundaries = str_ucnv_get_character_boundaries(self, index, true);
}
+ if (cache != NULL) {
+ cache->cached_boundaries_index = index;
+ cache->cached_boundaries = boundaries;
+ }
+
return boundaries;
}
static rb_str_t *
-str_get_characters(rb_str_t *self, long first, long last)
+str_get_characters(rb_str_t *self, long first, long last,
+ character_boundaries_cache_t *cache)
{
if (self->length_in_bytes == 0) {
if (first == 0) {
@@ -643,10 +697,17 @@
return NULL;
}
}
+
+ character_boundaries_cache_t local_cache;
+ if (cache == NULL) {
+ reset_character_boundaries_cache(&local_cache);
+ cache = &local_cache;
+ }
+
character_boundaries_t first_boundaries =
- str_get_character_boundaries(self, first);
+ str_get_character_boundaries(self, first, cache);
character_boundaries_t last_boundaries =
- str_get_character_boundaries(self, last);
+ str_get_character_boundaries(self, last, cache);
if ((first_boundaries.start_offset_in_bytes == -1) ||
(last_boundaries.end_offset_in_bytes == -1)) {
@@ -724,9 +785,12 @@
end.start_offset_in_bytes = end.end_offset_in_bytes = offset;
}
else {
+ character_boundaries_cache_t local_cache;
+ reset_character_boundaries_cache(&local_cache);
+
// Positioning in the string.
- beg = str_get_character_boundaries(self, pos);
- end = str_get_character_boundaries(self, pos + len - 1);
+ beg = str_get_character_boundaries(self, pos, &local_cache);
+ end = str_get_character_boundaries(self, pos + len - 1, &local_cache);
if ((beg.start_offset_in_bytes == -1) ||
(end.end_offset_in_bytes == -1)) {
@@ -867,14 +931,18 @@
str_reset_flags(self);
self->encoding = enc;
+ character_boundaries_cache_t local_cache;
+ reset_character_boundaries_cache(&local_cache);
+
character_boundaries_t first_boundaries =
- str_get_character_boundaries(self, start);
+ str_get_character_boundaries(self, start, &local_cache);
character_boundaries_t last_boundaries;
if (len == 1) {
last_boundaries = first_boundaries;
}
else {
- last_boundaries = str_get_character_boundaries(self, start+len-1);
+ last_boundaries = str_get_character_boundaries(self, start+len-1,
+ &local_cache);
}
if ((first_boundaries.start_offset_in_bytes == -1) ||
@@ -1043,13 +1111,16 @@
return start_index;
}
+ character_boundaries_cache_t local_cache;
+ reset_character_boundaries_cache(&local_cache);
+
long start_offset_in_bytes;
if (start_index == 0) {
start_offset_in_bytes = 0;
}
else {
character_boundaries_t boundaries = str_get_character_boundaries(self,
- start_index);
+ start_index, &local_cache);
if (boundaries.start_offset_in_bytes == -1) {
if (boundaries.end_offset_in_bytes == -1) {
return -1;
@@ -1068,7 +1139,7 @@
}
else {
character_boundaries_t boundaries = str_get_character_boundaries(self,
- end_index);
+ end_index, &local_cache);
if (boundaries.start_offset_in_bytes == -1) {
if (boundaries.end_offset_in_bytes == -1) {
return -1;
@@ -1256,13 +1327,14 @@
}
static VALUE
-rstr_substr(VALUE str, long beg, long len)
+rstr_substr_with_cache(VALUE str, long beg, long len,
+ character_boundaries_cache_t *cache)
{
if (len < 0) {
return Qnil;
}
- const long n = str_length(RSTR(str));
+ const long n = str_length_with_cache(RSTR(str), cache);
if (beg < 0) {
beg += n;
}
@@ -1276,11 +1348,17 @@
len = n - beg;
}
- rb_str_t *substr = str_get_characters(RSTR(str), beg, beg + len - 1);
+ rb_str_t *substr = str_get_characters(RSTR(str), beg, beg + len - 1, cache);
OBJ_INFECT(substr, str);
return substr == NULL ? Qnil : (VALUE)substr;
}
+static VALUE
+rstr_substr(VALUE str, long beg, long len)
+{
+ return rstr_substr_with_cache(str, beg, len, NULL);
+}
+
static void
rstr_splice(VALUE self, long beg, long len, VALUE str)
{
@@ -2961,6 +3039,9 @@
pat = get_pat(pat, true);
const bool tainted = OBJ_TAINTED(self) || OBJ_TAINTED(pat);
+ character_boundaries_cache_t local_cache;
+ reset_character_boundaries_cache(&local_cache);
+
VALUE ary = 0;
if (!block_given) {
ary = rb_ary_new();
@@ -2986,7 +3067,7 @@
VALUE scan_result;
if (count == 1) {
- scan_result = rb_reg_nth_match(0, match);
+ scan_result = rb_reg_nth_match_with_cache(0, match, &local_cache);
if (tainted) {
OBJ_TAINT(scan_result);
}
@@ -2994,7 +3075,8 @@
else {
scan_result = rb_ary_new2(count);
for (int i = 1; i < count; i++) {
- VALUE substr = rb_reg_nth_match(i, match);
+ VALUE substr = rb_reg_nth_match_with_cache(i, match,
+ &local_cache);
if (tainted) {
OBJ_TAINT(tainted);
}
@@ -3226,7 +3308,7 @@
tmp = rb_str_new(NULL, 0);
}
else {
- tmp = rb_str_subseq(str, beg, len - beg);
+ tmp = rb_str_substr(str, beg, len - beg);
}
rb_ary_push(result, tmp);
}
@@ -6451,16 +6533,33 @@
VALUE
rb_str_subseq(VALUE str, long beg, long len)
{
+ assert(IS_RSTR(str) && beg >= 0 && len >= 0
+ && RSTR(str)->length_in_bytes <= len + beg);
+ VALUE subseq;
+ if (len == 0) {
+ subseq = (VALUE)str_new_similar_empty_string(RSTR(str));
+ }
+ else {
+ subseq = (VALUE)str_new_copy_of_part(RSTR(str), beg, len);
+ }
+ OBJ_INFECT(subseq, str);
+ return subseq;
+}
+
+VALUE
+rb_str_substr_with_cache(VALUE str, long beg, long len,
+ character_boundaries_cache_t *cache)
+{
if (!IS_RSTR(str)) {
str = (VALUE)str_need_string(str);
}
- return rstr_substr(str, beg, len);
+ return rstr_substr_with_cache(str, beg, len, cache);
}
VALUE
rb_str_substr(VALUE str, long beg, long len)
{
- return rb_str_subseq(str, beg, len);
+ return rb_str_substr_with_cache(str, beg, len, NULL);
}
void
Modified: MacRuby/trunk/ucnv.c
===================================================================
--- MacRuby/trunk/ucnv.c 2011-01-07 04:13:16 UTC (rev 5129)
+++ MacRuby/trunk/ucnv.c 2011-01-07 13:24:02 UTC (rev 5130)
@@ -186,15 +186,8 @@
str_ucnv_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode)
{
character_boundaries_t boundaries = {-1, -1};
+ assert(index >= 0);
- if (index < 0) {
- // calculating the length is slow but we don't have much choice
- index += str_ucnv_length(self, ucs2_mode);
- if (index < 0) {
- return boundaries;
- }
- }
-
// the code has many similarities with str_length
USE_CONVERTER(cnv, self->encoding);
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20110107/4e621173/attachment-0001.html>
More information about the macruby-changes
mailing list