[macruby-changes] [5077] MacRuby/trunk

source_changes at macosforge.org source_changes at macosforge.org
Thu Dec 23 02:17:17 PST 2010


Revision: 5077
          http://trac.macosforge.org/projects/ruby/changeset/5077
Author:   vincent.isambart at gmail.com
Date:     2010-12-23 02:17:13 -0800 (Thu, 23 Dec 2010)
Log Message:
-----------
added many UTF-8-specific optimizations

Modified Paths:
--------------
    MacRuby/trunk/encoding.h
    MacRuby/trunk/string.c

Modified: MacRuby/trunk/encoding.h
===================================================================
--- MacRuby/trunk/encoding.h	2010-12-23 07:34:08 UTC (rev 5076)
+++ MacRuby/trunk/encoding.h	2010-12-23 10:17:13 UTC (rev 5077)
@@ -184,13 +184,6 @@
 }
 
 static inline bool
-str_known_to_have_an_invalid_encoding(rb_str_t *self)
-{
-    return (self->flags & (STRING_VALID_ENCODING_SET
-		| STRING_VALID_ENCODING)) == STRING_VALID_ENCODING_SET;
-}
-
-static inline bool
 str_check_flag_and_update_if_needed(rb_str_t *self, str_flag_t flag_set,
 	str_flag_t flag)
 {

Modified: MacRuby/trunk/string.c
===================================================================
--- MacRuby/trunk/string.c	2010-12-23 07:34:08 UTC (rev 5076)
+++ MacRuby/trunk/string.c	2010-12-23 10:17:13 UTC (rev 5077)
@@ -129,6 +129,26 @@
 	}
 	str_set_ascii_only(self, ascii_only);
     }
+    else if (IS_UTF8_ENC(self->encoding)) {
+	bool ascii_only = true;
+	bool valid_encoding = true;
+
+	for (int i = 0; i < self->length_in_bytes; ) {
+	    UChar32 c;
+	    U8_NEXT(self->bytes, i, self->length_in_bytes, c);
+	    if (c == U_SENTINEL) {
+		valid_encoding = false;
+		ascii_only = false;
+		break;
+	    }
+	    else if (c > 127) {
+		ascii_only = false;
+	    }
+	}
+
+	str_set_valid_encoding(self, valid_encoding);
+	str_set_ascii_only(self, ascii_only);
+    }
     else if (IS_UTF16_ENC(self->encoding)) {
 	str_update_flags_utf16(self);
     }
@@ -357,7 +377,8 @@
 static long
 str_length(rb_str_t *self)
 {
-    if (self->encoding->single_byte_encoding) {
+    if (self->encoding->single_byte_encoding
+	    || (self->encoding->ascii_compatible && str_is_ascii_only(self))) {
 	return self->length_in_bytes;
     }
     else if (IS_UTF8_ENC(self->encoding)) {
@@ -531,7 +552,8 @@
 {
     character_boundaries_t boundaries = {-1, -1};
 
-    if (self->encoding->single_byte_encoding) {
+    if (self->encoding->single_byte_encoding
+	    || (self->encoding->ascii_compatible && str_is_ascii_only(self))) {
 	if (index < 0) {
 	    index += self->length_in_bytes;
 	    if (index < 0) {
@@ -541,6 +563,54 @@
 	boundaries.start_offset_in_bytes = index;
 	boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes + 1;
     }
+    else if (IS_UTF8_ENC(self->encoding)) {
+	long pos = 0;
+	int i = 0;
+	if (index < 0) {
+	    index += str_length(self);
+	    if (index < 0) {
+		return boundaries;
+	    }
+	}
+	while (i < self->length_in_bytes) {
+	    UChar32 c;
+	    int old_i = i;
+	    long new_pos = pos;
+	    U8_NEXT(self->bytes, i, self->length_in_bytes, c);
+	    if (c == U_SENTINEL) {
+		new_pos += i - old_i;
+		if (new_pos > index) {
+		    boundaries.start_offset_in_bytes =
+			old_i + (index - pos);
+		    boundaries.end_offset_in_bytes =
+			boundaries.start_offset_in_bytes + 1;
+		    return boundaries;
+		}
+	    }
+	    else if (U_IS_BMP(c)) {
+		new_pos++;
+		if (new_pos > index) {
+		    boundaries.start_offset_in_bytes = old_i;
+		    boundaries.end_offset_in_bytes = i;
+		    return boundaries;
+		}
+	    }
+	    else {
+		new_pos += 2;
+		if (new_pos > index) {
+		    if (index == pos) {
+			boundaries.start_offset_in_bytes = old_i;
+		    }
+		    else {
+			assert(index == pos + 1);
+			boundaries.end_offset_in_bytes = i;
+		    }
+		    return boundaries;
+		}
+	    }
+	    pos = new_pos;
+	}
+    }
     else if (IS_UTF16_ENC(self->encoding)) {
 	if (index < 0) {
 	    index += div_round_up(self->length_in_bytes, 2);
@@ -1016,10 +1086,16 @@
     if (range_length_in_uchars <= 0) {
 	return;
     }
-    if (IS_NATIVE_UTF16_ENC(self->encoding)) {
+    if (self->encoding->ascii_compatible && str_is_ascii_only(self)) {
+	char *source_bytes = &self->bytes[range_start_offset_in_uchars];
+	for (long i = 0; i < range_length_in_uchars; ++i) {
+	    buffer[i] = source_bytes[i];
+	}
+    }
+    else if (IS_NATIVE_UTF16_ENC(self->encoding)) {
 	memcpy(buffer,
-		&self->bytes[BYTES_TO_UCHARS(range_start_offset_in_uchars)],
-		BYTES_TO_UCHARS(range_length_in_uchars));
+		&self->bytes[UCHARS_TO_BYTES(range_start_offset_in_uchars)],
+		UCHARS_TO_BYTES(range_length_in_uchars));
     }
     else {
 	__block long pos_in_src = 0;
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20101223/4e4c43b1/attachment-0001.html>


More information about the macruby-changes mailing list