[macruby-changes] [5054] MacRuby/trunk

source_changes at macosforge.org source_changes at macosforge.org
Sat Dec 18 22:13:14 PST 2010


Revision: 5054
          http://trac.macosforge.org/projects/ruby/changeset/5054
Author:   vincent.isambart at gmail.com
Date:     2010-12-18 22:13:10 -0800 (Sat, 18 Dec 2010)
Log Message:
-----------
changed the internal representation of strings

The strings could have 2 internal representations (UTF-16 or binary),
there is now only the binary one. It makes a few things harder, but
other things much simpler.

The main reason for doing it is that we could have problems in
multi-threaded applications, when multiple threads are using the same
string at the same time, even without using any operation modifying the
string (as some operations were preferring using the string in UTF-16 and
others in binary mode).

Modified Paths:
--------------
    MacRuby/trunk/encoding.c
    MacRuby/trunk/encoding.h
    MacRuby/trunk/encoding_ucnv.h
    MacRuby/trunk/string.c
    MacRuby/trunk/transcode.c
    MacRuby/trunk/ucnv.c

Modified: MacRuby/trunk/encoding.c
===================================================================
--- MacRuby/trunk/encoding.c	2010-12-19 03:10:47 UTC (rev 5053)
+++ MacRuby/trunk/encoding.c	2010-12-19 06:13:10 UTC (rev 5054)
@@ -225,6 +225,7 @@
 	bool single_byte_encoding, // in the encoding a character takes only
 				   // one byte
 	bool ascii_compatible, // is the encoding ASCII compatible or not
+	bool little_endian, // for UTF-16/32, if the encoding is little endian
 	... // aliases for the encoding (should no include the public name)
 	    // - must end with a NULL
 	)
@@ -234,14 +235,14 @@
     // create an array for the aliases
     unsigned int aliases_count = 0;
     va_list va_aliases;
-    va_start(va_aliases, ascii_compatible);
+    va_start(va_aliases, little_endian);
     while (va_arg(va_aliases, const char *) != NULL) {
 	++aliases_count;
     }
     va_end(va_aliases);
     const char **aliases = (const char **)
 	malloc(sizeof(const char *) * aliases_count);
-    va_start(va_aliases, ascii_compatible);
+    va_start(va_aliases, little_endian);
     for (unsigned int i = 0; i < aliases_count; ++i) {
 	aliases[i] = va_arg(va_aliases, const char *);
     }
@@ -260,6 +261,7 @@
     encoding->min_char_size = min_char_size;
     encoding->single_byte_encoding = single_byte_encoding;
     encoding->ascii_compatible = ascii_compatible;
+    encoding->little_endian = little_endian;
     encoding->aliases_count = aliases_count;
     encoding->aliases = aliases;
 
@@ -279,20 +281,20 @@
 void
 Init_PreEncoding(void)
 {
-    add_encoding(ENCODING_BINARY,      ENCODING_TYPE_SPECIAL, "ASCII-8BIT",  1, true,  true,  "BINARY", NULL);
-    add_encoding(ENCODING_ASCII,       ENCODING_TYPE_UCNV,    "US-ASCII",    1, true,  true,  "ASCII", "ANSI_X3.4-1968", "646", NULL);
-    add_encoding(ENCODING_UTF8,        ENCODING_TYPE_UCNV,    "UTF-8",       1, false, true,  "CP65001", "locale", NULL);
-    add_encoding(ENCODING_UTF16BE,     ENCODING_TYPE_UCNV,    "UTF-16BE",    2, false, false, NULL);
-    add_encoding(ENCODING_UTF16LE,     ENCODING_TYPE_UCNV,    "UTF-16LE",    2, false, false, NULL);
-    add_encoding(ENCODING_UTF32BE,     ENCODING_TYPE_UCNV,    "UTF-32BE",    4, false, false, "UCS-4BE", NULL);
-    add_encoding(ENCODING_UTF32LE,     ENCODING_TYPE_UCNV,    "UTF-32LE",    4, false, false, "UCS-4LE", NULL);
-    add_encoding(ENCODING_ISO8859_1,   ENCODING_TYPE_UCNV,    "ISO-8859-1",  1, true,  true,  "ISO8859-1", NULL);
-    add_encoding(ENCODING_MACROMAN,    ENCODING_TYPE_UCNV,    "macRoman",    1, true,  true,  NULL);
-    add_encoding(ENCODING_MACCYRILLIC, ENCODING_TYPE_UCNV,    "macCyrillic", 1, true,  true,  NULL);
-    add_encoding(ENCODING_BIG5,        ENCODING_TYPE_UCNV,    "Big5",        1, false, true,  "CP950", NULL);
+    add_encoding(ENCODING_BINARY,      ENCODING_TYPE_SPECIAL, "ASCII-8BIT",  1, true,  true,  false, "BINARY", NULL);
+    add_encoding(ENCODING_ASCII,       ENCODING_TYPE_UCNV,    "US-ASCII",    1, true,  true,  false, "ASCII", "ANSI_X3.4-1968", "646", NULL);
+    add_encoding(ENCODING_UTF8,        ENCODING_TYPE_UCNV,    "UTF-8",       1, false, true,  false, "CP65001", "locale", NULL);
+    add_encoding(ENCODING_UTF16BE,     ENCODING_TYPE_UCNV,    "UTF-16BE",    2, false, false, false, NULL);
+    add_encoding(ENCODING_UTF16LE,     ENCODING_TYPE_UCNV,    "UTF-16LE",    2, false, false, true,  NULL);
+    add_encoding(ENCODING_UTF32BE,     ENCODING_TYPE_UCNV,    "UTF-32BE",    4, false, false, false, "UCS-4BE", NULL);
+    add_encoding(ENCODING_UTF32LE,     ENCODING_TYPE_UCNV,    "UTF-32LE",    4, false, false, true,  "UCS-4LE", NULL);
+    add_encoding(ENCODING_ISO8859_1,   ENCODING_TYPE_UCNV,    "ISO-8859-1",  1, true,  true,  false, "ISO8859-1", NULL);
+    add_encoding(ENCODING_MACROMAN,    ENCODING_TYPE_UCNV,    "macRoman",    1, true,  true,  false, NULL);
+    add_encoding(ENCODING_MACCYRILLIC, ENCODING_TYPE_UCNV,    "macCyrillic", 1, true,  true,  false, NULL);
+    add_encoding(ENCODING_BIG5,        ENCODING_TYPE_UCNV,    "Big5",        1, false, true,  false, "CP950", NULL);
     // FIXME: the ICU conversion tables do not seem to match Ruby's Japanese conversion tables
-    add_encoding(ENCODING_EUCJP,       ENCODING_TYPE_UCNV,    "EUC-JP",      1, false, true,  "eucJP", NULL);
-    add_encoding(ENCODING_SJIS,        ENCODING_TYPE_UCNV,    "Shift_JIS",   1, false, true,  "SJIS", NULL);
+    add_encoding(ENCODING_EUCJP,       ENCODING_TYPE_UCNV,    "EUC-JP",      1, false, true,  false, "eucJP", NULL);
+    add_encoding(ENCODING_SJIS,        ENCODING_TYPE_UCNV,    "Shift_JIS",   1, false, true,  false, "SJIS", NULL);
     //add_encoding(ENCODING_EUCJP,     ENCODING_TYPE_RUBY, "EUC-JP",      1, false, true,  "eucJP", NULL);
     //add_encoding(ENCODING_SJIS,      ENCODING_TYPE_RUBY, "Shift_JIS",   1, false, true, "SJIS", NULL);
     //add_encoding(ENCODING_CP932,     ENCODING_TYPE_RUBY, "Windows-31J", 1, false, true, "CP932", "csWindows31J", NULL);

Modified: MacRuby/trunk/encoding.h
===================================================================
--- MacRuby/trunk/encoding.h	2010-12-19 03:10:47 UTC (rev 5053)
+++ MacRuby/trunk/encoding.h	2010-12-19 06:13:10 UTC (rev 5054)
@@ -34,19 +34,21 @@
 #define ENCODING_UTF32_NON_NATIVE ENCODING_UTF32LE
 #endif
 
-#define NATIVE_UTF16_ENC(encoding) \
+#define IS_NATIVE_UTF16_ENC(encoding) \
     ((encoding) == rb_encodings[ENCODING_UTF16_NATIVE])
-#define NON_NATIVE_UTF16_ENC(encoding) \
+#define IS_NON_NATIVE_UTF16_ENC(encoding) \
     ((encoding) == rb_encodings[ENCODING_UTF16_NON_NATIVE])
-#define UTF16_ENC(encoding) \
-    (NATIVE_UTF16_ENC(encoding) || NON_NATIVE_UTF16_ENC(encoding))
-#define NATIVE_UTF32_ENC(encoding) \
+#define IS_UTF16_ENC(encoding) \
+    (IS_NATIVE_UTF16_ENC(encoding) || IS_NON_NATIVE_UTF16_ENC(encoding))
+#define IS_NATIVE_UTF32_ENC(encoding) \
     ((encoding) == rb_encodings[ENCODING_UTF32_NATIVE])
-#define NON_NATIVE_UTF32_ENC(encoding) \
+#define IS_NON_NATIVE_UTF32_ENC(encoding) \
     ((encoding) == rb_encodings[ENCODING_UTF32_NON_NATIVE])
-#define UTF32_ENC(encoding) \
-    (NATIVE_UTF32_ENC(encoding) || NON_NATIVE_UTF32_ENC(encoding))
-#define BINARY_ENC(encoding) ((encoding) == rb_encodings[ENCODING_BINARY])
+#define IS_UTF32_ENC(encoding) \
+    (IS_NATIVE_UTF32_ENC(encoding) || IS_NON_NATIVE_UTF32_ENC(encoding))
+#define IS_UTF8_ENC(encoding) ((encoding) == rb_encodings[ENCODING_UTF8])
+#define IS_ASCII_ENC(encoding) ((encoding) == rb_encodings[ENCODING_ASCII])
+#define IS_BINARY_ENC(encoding) ((encoding) == rb_encodings[ENCODING_BINARY])
 
 typedef uint8_t str_flag_t;
 
@@ -55,10 +57,7 @@
     struct rb_encoding *encoding;
     long capacity_in_bytes;
     long length_in_bytes;
-    union {
-	char *bytes;
-	UChar *uchars;
-    } data;
+    char *bytes;
     str_flag_t flags;
 } rb_str_t;
 
@@ -119,6 +118,7 @@
     unsigned char min_char_size;
     bool single_byte_encoding : 1;
     bool ascii_compatible : 1;
+    bool little_endian : 1; // only meaningful for UTF-16 or UTF-32
     void *private_data;
 } rb_encoding_t;
 
@@ -145,16 +145,11 @@
 
 extern rb_encoding_t *rb_encodings[ENCODINGS_COUNT];
 
-#define STRING_HAS_SUPPLEMENTARY     0x020
-#define STRING_HAS_SUPPLEMENTARY_SET 0x010
 #define STRING_ASCII_ONLY_SET        0x010
 #define STRING_ASCII_ONLY            0x008
 #define STRING_VALID_ENCODING_SET    0x004
 #define STRING_VALID_ENCODING        0x002
-#define STRING_STORED_IN_UCHARS      0x001
 
-#define STRING_REQUIRED_FLAGS STRING_STORED_IN_UCHARS
-
 #define BYTES_TO_UCHARS(len) ((len) / sizeof(UChar))
 #define UCHARS_TO_BYTES(len) ((len) * sizeof(UChar))
 
@@ -169,10 +164,9 @@
 void str_update_flags(rb_str_t *self);
 
 static inline void
-str_unset_facultative_flags(rb_str_t *self)
+str_reset_flags(rb_str_t *self)
 {
-    self->flags &= ~STRING_HAS_SUPPLEMENTARY_SET & ~STRING_ASCII_ONLY_SET
-	& ~STRING_VALID_ENCODING_SET;
+    self->flags = 0;
 }
 
 static inline bool
@@ -183,13 +177,6 @@
 }
 
 static inline bool
-str_known_not_to_have_any_supplementary(rb_str_t *self)
-{
-    return (self->flags & (STRING_HAS_SUPPLEMENTARY_SET
-		| STRING_HAS_SUPPLEMENTARY)) == STRING_HAS_SUPPLEMENTARY_SET;
-}
-
-static inline bool
 str_check_flag_and_update_if_needed(rb_str_t *self, str_flag_t flag_set,
 	str_flag_t flag)
 {
@@ -226,31 +213,8 @@
     return str_is_ascii_only(self);
 }
 
-static inline bool
-str_is_stored_in_uchars(rb_str_t *self)
-{
-    return self->flags & STRING_STORED_IN_UCHARS;
-}
-
 static inline void
-str_negate_stored_in_uchars(rb_str_t *self)
-{
-    self->flags ^= STRING_STORED_IN_UCHARS;
-}
-
-static inline void
-str_set_stored_in_uchars(rb_str_t *self, bool status)
-{
-    if (status) {
-	self->flags |= STRING_STORED_IN_UCHARS;
-    }
-    else {
-	self->flags &= ~STRING_STORED_IN_UCHARS;
-    }
-}
-
-static inline void
-str_set_facultative_flag(rb_str_t *self, bool status, str_flag_t flag_set,
+str_set_flag(rb_str_t *self, bool status, str_flag_t flag_set,
 	str_flag_t flag)
 {
     if (status) {
@@ -262,23 +226,15 @@
 }
 
 static inline void
-str_set_has_supplementary(rb_str_t *self, bool status)
-{
-    str_set_facultative_flag(self, status, STRING_HAS_SUPPLEMENTARY_SET,
-	    STRING_HAS_SUPPLEMENTARY);
-}
-
-static inline void
 str_set_ascii_only(rb_str_t *self, bool status)
 {
-    str_set_facultative_flag(self, status, STRING_ASCII_ONLY_SET,
-	    STRING_ASCII_ONLY);
+    str_set_flag(self, status, STRING_ASCII_ONLY_SET, STRING_ASCII_ONLY);
 }
 
 static inline void
 str_set_valid_encoding(rb_str_t *self, bool status)
 {
-    str_set_facultative_flag(self, status, STRING_VALID_ENCODING_SET,
+    str_set_flag(self, status, STRING_VALID_ENCODING_SET,
 	    STRING_VALID_ENCODING);
 }
 

Modified: MacRuby/trunk/encoding_ucnv.h
===================================================================
--- MacRuby/trunk/encoding_ucnv.h	2010-12-19 03:10:47 UTC (rev 5053)
+++ MacRuby/trunk/encoding_ucnv.h	2010-12-19 06:13:10 UTC (rev 5054)
@@ -18,18 +18,15 @@
 extern "C" {
 #endif
 
-typedef void (^each_char_callback_t)(UChar32 c, const char* character_start, long character_length, bool *stop);
+typedef void (^each_uchar32_callback_t)(UChar32 c, long start_index, long length, bool *stop);
 
 void str_ucnv_update_flags(rb_str_t *self);
-void str_ucnv_make_data_binary(rb_str_t *self);
-bool str_ucnv_try_making_data_uchars(rb_str_t *self);
 long str_ucnv_length(rb_str_t *self, bool ucs2_mode);
-long str_ucnv_bytesize(rb_str_t *self);
 character_boundaries_t str_ucnv_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode);
 long str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes, bool ucs2_mode);
 void str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc, rb_str_t *self, long *pos, UChar **utf16, long *utf16_length);
 void str_ucnv_transcode_from_utf16(struct rb_encoding *dst_enc, UChar *utf16, long utf16_length, long *utf16_pos, char **bytes, long *bytes_length);
-void str_ucnv_each_char(rb_str_t *self, each_char_callback_t callback);
+void str_ucnv_each_uchar32(rb_str_t *self, each_uchar32_callback_t callback);
 
 #if defined(__cplusplus)
 } // extern "C"

Modified: MacRuby/trunk/string.c
===================================================================
--- MacRuby/trunk/string.c	2010-12-19 03:10:47 UTC (rev 5053)
+++ MacRuby/trunk/string.c	2010-12-19 06:13:10 UTC (rev 5054)
@@ -37,20 +37,18 @@
 static void
 str_update_flags_utf16(rb_str_t *self)
 {
-    assert(str_is_stored_in_uchars(self)
-	    || NON_NATIVE_UTF16_ENC(self->encoding));
+    assert(IS_UTF16_ENC(self->encoding));
 
     bool ascii_only = true;
-    bool has_supplementary = false;
     bool valid_encoding = true;
     // if the length is an odd number, it can't be valid UTF-16
     if (ODD_NUMBER(self->length_in_bytes)) {
 	valid_encoding = false;
     }
 
-    UChar *uchars = self->data.uchars;
+    UChar *uchars = (UChar *)self->bytes;
     long uchars_count = BYTES_TO_UCHARS(self->length_in_bytes);
-    bool native_byte_order = str_is_stored_in_uchars(self);
+    bool native_byte_order = IS_NATIVE_UTF16_ENC(self->encoding);
     UChar32 lead = 0;
     for (int i = 0; i < uchars_count; ++i) {
 	UChar32 c;
@@ -76,7 +74,6 @@
 		    valid_encoding = false;
 		}
 		else {
-		    has_supplementary = true;
 		    c = U16_GET_SUPPLEMENTARY(lead, c);
 		    if (!U_IS_UNICODE_CHAR(c)) {
 			valid_encoding = false;
@@ -104,7 +101,6 @@
 	valid_encoding = false;
     }
 
-    str_set_has_supplementary(self, has_supplementary);
     if (valid_encoding) {
 	str_set_valid_encoding(self, true);
 	str_set_ascii_only(self, ascii_only);
@@ -121,21 +117,19 @@
     if (self->length_in_bytes == 0) {
 	str_set_valid_encoding(self, true);
 	str_set_ascii_only(self, true);
-	str_set_has_supplementary(self, false);
     }
-    else if (BINARY_ENC(self->encoding)) {
+    else if (IS_BINARY_ENC(self->encoding)) {
 	str_set_valid_encoding(self, true);
-	str_set_has_supplementary(self, false);
 	bool ascii_only = true;
 	for (long i = 0; i < self->length_in_bytes; ++i) {
-	    if ((uint8_t)self->data.bytes[i] > 127) {
+	    if ((uint8_t)self->bytes[i] > 127) {
 		ascii_only = false;
 		break;
 	    }
 	}
 	str_set_ascii_only(self, ascii_only);
     }
-    else if (str_is_stored_in_uchars(self) || UTF16_ENC(self->encoding)) {
+    else if (IS_UTF16_ENC(self->encoding)) {
 	str_update_flags_utf16(self);
     }
     else {
@@ -143,26 +137,6 @@
     }
 }
 
-static void
-str_invert_byte_order(rb_str_t *self)
-{
-    assert(NON_NATIVE_UTF16_ENC(self->encoding));
-
-    long length_in_bytes = self->length_in_bytes;
-    char *bytes = self->data.bytes;
-
-    if (ODD_NUMBER(length_in_bytes)) {
-	--length_in_bytes;
-    }
-
-    for (long i = 0; i < length_in_bytes; i += 2) {
-	char tmp = bytes[i];
-	bytes[i] = bytes[i+1];
-	bytes[i+1] = tmp;
-    }
-    str_negate_stored_in_uchars(self);
-}
-
 static rb_encoding_t *
 str_compatible_encoding(rb_str_t *str1, rb_str_t *str2)
 {
@@ -223,8 +197,9 @@
     str->encoding = rb_encodings[ENCODING_UTF8];
     str->capacity_in_bytes = 0;
     str->length_in_bytes = 0;
-    str->data.bytes = NULL;
-    str->flags = 0;
+    str->bytes = NULL;
+    str_reset_flags(str);
+
     return str;
 }
 
@@ -247,13 +222,13 @@
     assert(len >= 0);
     assert(enc != NULL);
 
-    self->flags = 0;
+    str_reset_flags(self);
     self->encoding = enc;
     self->capacity_in_bytes = len;
     if (len > 0) {
-	GC_WB(&self->data.bytes, xmalloc(len));
+	GC_WB(&self->bytes, xmalloc(len));
 	if (bytes != NULL) {
-	    memcpy(self->data.bytes, bytes, len);
+	    memcpy(self->bytes, bytes, len);
 	    self->length_in_bytes = len;
 	}
 	else {
@@ -261,7 +236,7 @@
 	}
     }
     else {
-	self->data.bytes = NULL;
+	self->bytes = NULL;
 	self->length_in_bytes = 0;
     }
 }
@@ -272,63 +247,62 @@
     if (self == source) {
 	return;
     }
-    str_replace_with_bytes(self, source->data.bytes, source->length_in_bytes,
+    str_replace_with_bytes(self, source->bytes, source->length_in_bytes,
 	    source->encoding);
     self->flags = source->flags;
 }
 
-static bool str_try_making_data_uchars(rb_str_t *self);
+static void str_resize_bytes(rb_str_t *self, long new_capacity);
+static void str_concat_bytes(rb_str_t *self, const char *bytes, long len);
 
 static void
 str_append_uchar32(rb_str_t *self, UChar32 c)
 {
-    assert(str_try_making_data_uchars(self));
-    const long uchar_cap = BYTES_TO_UCHARS(self->capacity_in_bytes);
-    const long uchar_len = BYTES_TO_UCHARS(self->length_in_bytes);
-    int concat_len = U_IS_BMP(c) ? 1 : 2;
-    if (uchar_len + concat_len >= uchar_cap) {
-	assert(uchar_len + concat_len < uchar_cap + 10);
-	self->capacity_in_bytes += UCHARS_TO_BYTES(10);
-	UChar *uchars = (UChar *)xrealloc(self->data.uchars,
-		self->capacity_in_bytes);
-	if (uchars != self->data.uchars) {
-	    GC_WB(&self->data.uchars, uchars);
+    if ((c <= 127) && self->encoding->ascii_compatible) {
+	str_resize_bytes(self, self->length_in_bytes + 1);
+	self->bytes[self->length_in_bytes] = c;
+	self->length_in_bytes++;
+    }
+    else if (IS_UTF8_ENC(self->encoding)) {
+	long len = U8_LENGTH(c);
+	if (len > 0) {
+	    str_resize_bytes(self, self->length_in_bytes + len);
+	    U8_APPEND_UNSAFE(self->bytes, self->length_in_bytes, c);
+	    self->length_in_bytes += len;
 	}
     }
-    if (U_IS_BMP(c)) {
-	self->data.uchars[uchar_len] = c;
+    else if (IS_NATIVE_UTF32_ENC(self->encoding)) {
+	str_concat_bytes(self, (char *)&c, 4);
     }
+    else if (IS_NATIVE_UTF16_ENC(self->encoding) && U_IS_BMP(c)) {
+	UChar uchar = c;
+	str_concat_bytes(self, (char *)&uchar, 2);
+    }
     else {
-	self->data.uchars[uchar_len] = U16_LEAD(c);
-	self->data.uchars[uchar_len+1] = U16_TRAIL(c);
+	rb_str_t *str = RSTR(rb_enc_str_new((char *)&c, 4,
+		    rb_encodings[ENCODING_UTF32_NATIVE]));
+	str = str_simple_transcode(str, self->encoding);
+	str_concat_bytes(self, str->bytes, str->length_in_bytes);
     }
-    self->length_in_bytes += UCHARS_TO_BYTES(concat_len);
 }
 
+static void str_concat_uchars(rb_str_t *self, const UChar *chars, long len);
 static void
 str_replace_with_uchars(rb_str_t *self, const UChar *chars, long len)
 {
     assert(len >= 0);
 
-    len = UCHARS_TO_BYTES(len);
-    self->flags = 0;
+    str_reset_flags(self);
+    self->length_in_bytes = 0;
     self->encoding = rb_encodings[ENCODING_UTF8];
-    self->capacity_in_bytes = len;
     if (len > 0) {
-	GC_WB(&self->data.uchars, xmalloc(len));
-	if (chars != NULL) {
-	    memcpy(self->data.uchars, chars, len);
-	    self->length_in_bytes = len;
+	if (chars == NULL) {
+	    str_resize_bytes(self, len);
 	}
 	else {
-	    self->length_in_bytes = 0;
+	    str_concat_uchars(self, chars, len);
 	}
-	str_set_stored_in_uchars(self, true);
     }
-    else {
-	self->data.uchars = NULL;
-	self->length_in_bytes = 0;
-    }
 }
 
 static void
@@ -380,132 +354,77 @@
     return destination;
 }
 
-static void
-str_make_data_binary(rb_str_t *self)
+static long
+str_length(rb_str_t *self)
 {
-    if (!str_is_stored_in_uchars(self) || NATIVE_UTF16_ENC(self->encoding)) {
-	// nothing to do
-	return;
+    if (self->encoding->single_byte_encoding) {
+	return self->length_in_bytes;
     }
-
-    if (NON_NATIVE_UTF16_ENC(self->encoding)) {
-	// Doing the conversion ourself is faster, and anyway ICU's converter
-	// does not like non-paired surrogates.
-	str_invert_byte_order(self);
-	return;
-    }
-
-    str_ucnv_make_data_binary(self);
-}
-
-static bool
-str_try_making_data_uchars(rb_str_t *self)
-{
-    if (str_is_stored_in_uchars(self)) {
-	return true;
-    }
-    else if (NATIVE_UTF16_ENC(self->encoding)) {
-	// sometimes the flag might not already be set so set it
-	str_set_stored_in_uchars(self, true);
-	return true;
-    }
-    else if (NON_NATIVE_UTF16_ENC(self->encoding)) {
-	str_invert_byte_order(self);
-	return true;
-    }
-    else if (BINARY_ENC(self->encoding)) {
-	// you can't convert binary to anything
-	return false;
-    }
-    else if (self->length_in_bytes == 0) {
-	// for empty strings, nothing to convert
-	str_set_stored_in_uchars(self, true);
-	return true;
-    }
-    else if (str_known_to_have_an_invalid_encoding(self)) {
-	return false;
-    }
-
-    return str_ucnv_try_making_data_uchars(self);
-}
-
-static void
-str_make_same_format(rb_str_t *str1, rb_str_t *str2)
-{
-    if (str_is_stored_in_uchars(str1) != str_is_stored_in_uchars(str2)) {
-	if (str_is_stored_in_uchars(str1)) {
-	    if (!str_try_making_data_uchars(str2)) {
-		str_make_data_binary(str1);
+    else if (IS_UTF8_ENC(self->encoding)) {
+	long length = 0;
+	int i = 0;
+	while (i < self->length_in_bytes) {
+	    UChar32 c;
+	    int old_i = i;
+	    U8_NEXT(self->bytes, i, self->length_in_bytes, c);
+	    if (c == U_SENTINEL) {
+		length += i - old_i;
 	    }
+	    else if (U_IS_BMP(c)) {
+		length++;
+	    }
+	    else {
+		length += 2;
+	    }
 	}
-	else {
-	    str_make_data_binary(str2);
-	}
+	return length;
     }
-}
-
-static long
-str_length(rb_str_t *self)
-{
-    if (self->length_in_bytes == 0) {
-	return 0;
-    }
-    if (str_try_making_data_uchars(self)) {
+    else if (IS_UTF16_ENC(self->encoding)) {
 	return div_round_up(self->length_in_bytes, 2);
     }
     else {
-	if (self->encoding->single_byte_encoding) {
-	    return self->length_in_bytes;
-	}
-	else if (NON_NATIVE_UTF16_ENC(self->encoding)) {
-	    return div_round_up(self->length_in_bytes, 2);
-	}
-	else {
-	    return str_ucnv_length(self, true);
-	}
+	return str_ucnv_length(self, true);
     }
 }
 
 // Note that each_char iterates on unicode characters
 // With a character not in the BMP the callback will only be called once!
 static void
-str_each_char(rb_str_t *self, each_char_callback_t callback)
+str_each_uchar32(rb_str_t *self, each_uchar32_callback_t callback)
 {
-    if (str_is_stored_in_uchars(self)) {
+    if (IS_BINARY_ENC(self->encoding) || IS_ASCII_ENC(self->encoding)) {
 	bool stop = false;
-	long length = BYTES_TO_UCHARS(self->length_in_bytes);
-	for (long i = 0; i < length;) {
-	    UChar32 c;
-	    long old_i = i;
-	    U16_NEXT(self->data.uchars, i, length, c);
-	    callback(c, (const char *)&self->data.uchars[old_i],
-		    UCHARS_TO_BYTES(old_i-i), &stop);
+	for (long i = 0; i < self->length_in_bytes; ++i) {
+	    UChar32 c = (uint8_t)self->bytes[i];
+	    if (c > 127) {
+		c = U_SENTINEL;
+	    }
+	    callback(c, i, 1, &stop);
 	    if (stop) {
 		return;
 	    }
-	};
+	}
     }
-    else if (BINARY_ENC(self->encoding)
-	    || (self->encoding == rb_encodings[ENCODING_ASCII])) {
-	const uint8_t *pos = (uint8_t*)self->data.bytes;
-	const uint8_t *end = pos + self->length_in_bytes;
+    else if (IS_NATIVE_UTF16_ENC(self->encoding)) {
 	bool stop = false;
-	for (; pos < end; ++pos) {
+	long length = BYTES_TO_UCHARS(self->length_in_bytes);
+	UChar *uchars = (UChar *)self->bytes;
+	for (long i = 0; i < length;) {
 	    UChar32 c;
-	    if (*pos > 127) {
-		c = U_SENTINEL;
-	    }
-	    else {
-		c = *pos;
-	    }
-	    callback(c, (const char *)pos, 1, &stop);
+	    long old_i = i;
+	    U16_NEXT(uchars, i, length, c);
+	    callback(c, UCHARS_TO_BYTES(old_i),
+		    UCHARS_TO_BYTES(i-old_i), &stop);
 	    if (stop) {
 		return;
 	    }
-	}
+	    // in case the length changed
+	    // (it should not happen but never know)
+	    length = BYTES_TO_UCHARS(self->length_in_bytes);
+	};
     }
     else {
-	str_ucnv_each_char(self, callback);
+	str_ucnv_each_uchar32(self, callback);
     }
 }
 
@@ -513,27 +432,45 @@
 str_get_uchar(rb_str_t *self, long pos)
 {
     assert(pos >= 0 && pos < str_length(self));
-    if (str_try_making_data_uchars(self)) {
-	return self->data.uchars[pos];
+
+    if (IS_NATIVE_UTF16_ENC(self->encoding)) {
+	return ((UChar *)self->bytes)[pos];
     }
-    //assert(BINARY_ENC(self->encoding));
-    return self->data.bytes[pos];
+
+    __block UChar return_value = 0;
+    __block long i = 0;
+    str_each_uchar32(self, ^(UChar32 c, long start_index, long char_len, bool *stop) {
+	if (c == U_SENTINEL || U_IS_BMP(c)) {
+	    if (i == pos) {
+		return_value = c;
+		*stop = true;
+	    }
+	    else {
+		++i;
+	    }
+	}
+	else {
+	    if (i == pos) {
+		return_value = U16_LEAD(c);
+		*stop = true;
+	    }
+	    else if (i+1 == pos) {
+		return_value = U16_TRAIL(c);
+		*stop = true;
+	    }
+	    else {
+		i += 2;
+	    }
+	}
+    });
+
+    return return_value;
 }
 
 static long
 str_bytesize(rb_str_t *self)
 {
-    if (str_is_stored_in_uchars(self)) {
-	if (UTF16_ENC(self->encoding)) {
-	    return self->length_in_bytes;
-	}
-	else {
-	    return str_ucnv_bytesize(self);
-	}
-    }
-    else {
-	return self->length_in_bytes;
-    }
+    return self->length_in_bytes;
 }
 
 static rb_str_t *
@@ -541,7 +478,6 @@
 {
     rb_str_t *str = str_alloc(rb_cRubyString);
     str->encoding = self->encoding;
-    str->flags = self->flags & STRING_REQUIRED_FLAGS;
     return str;
 }
 
@@ -552,9 +488,8 @@
     rb_str_t *str = str_alloc(rb_cRubyString);
     str->encoding = self->encoding;
     str->capacity_in_bytes = str->length_in_bytes = length_in_bytes;
-    str->flags = self->flags & STRING_REQUIRED_FLAGS;
-    GC_WB(&str->data.bytes, xmalloc(length_in_bytes));
-    memcpy(str->data.bytes, &self->data.bytes[offset_in_bytes],
+    GC_WB(&str->bytes, xmalloc(length_in_bytes));
+    memcpy(str->bytes, &self->bytes[offset_in_bytes],
 	    length_in_bytes);
     return str;
 }
@@ -573,69 +508,29 @@
 {
     character_boundaries_t boundaries = {-1, -1};
 
-    if (str_is_stored_in_uchars(self)) {
+    if (self->encoding->single_byte_encoding) {
 	if (index < 0) {
-	    index += div_round_up(self->length_in_bytes, 2);
+	    index += self->length_in_bytes;
 	    if (index < 0) {
 		return boundaries;
 	    }
 	}
-	boundaries.start_offset_in_bytes = UCHARS_TO_BYTES(index);
-	boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes
-	    + 2;
-	if (!UTF16_ENC(self->encoding)) {
-	    long length = BYTES_TO_UCHARS(self->length_in_bytes);
-	    if ((index < length)
-		    && U16_IS_SURROGATE(self->data.uchars[index])) {
-		if (U16_IS_SURROGATE_LEAD(self->data.uchars[index])) {
-		    boundaries.end_offset_in_bytes = -1;
-		}
-		else { // U16_IS_SURROGATE_TRAIL
-		    boundaries.start_offset_in_bytes = -1;
-		}
-	    }
-	}
+	boundaries.start_offset_in_bytes = index;
+	boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes + 1;
     }
-    else { // data in binary
-	if (self->encoding->single_byte_encoding) {
+    else if (IS_UTF16_ENC(self->encoding)) {
+	if (index < 0) {
+	    index += div_round_up(self->length_in_bytes, 2);
 	    if (index < 0) {
-		index += self->length_in_bytes;
-		if (index < 0) {
-		    return boundaries;
-		}
+		return boundaries;
 	    }
-	    boundaries.start_offset_in_bytes = index;
-	    boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes
-		+ 1;
 	}
-	else if (UTF32_ENC(self->encoding)
-		&& str_known_not_to_have_any_supplementary(self)) {
-	    if (index < 0) {
-		index += div_round_up(self->length_in_bytes, 4);
-		if (index < 0) {
-		    return boundaries;
-		}
-	    }
-	    boundaries.start_offset_in_bytes = index * 4;
-	    boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes
-		+ 4;
-	}
-	else if (NON_NATIVE_UTF16_ENC(self->encoding)) {
-	    if (index < 0) {
-		index += div_round_up(self->length_in_bytes, 2);
-		if (index < 0) {
-		    return boundaries;
-		}
-	    }
-	    boundaries.start_offset_in_bytes = UCHARS_TO_BYTES(index);
-	    boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes
-		+ 2;
-	}
-	else {
-	    boundaries = str_ucnv_get_character_boundaries(self,
-		    index, true);
-	}
+	boundaries.start_offset_in_bytes = UCHARS_TO_BYTES(index);
+	boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes + 2;
     }
+    else {
+	boundaries = str_ucnv_get_character_boundaries(self, index, true);
+    }
 
     return boundaries;
 }
@@ -651,10 +546,6 @@
 	    return NULL;
 	}
     }
-    if (!self->encoding->single_byte_encoding
-	    && !str_is_stored_in_uchars(self)) {
-	str_try_making_data_uchars(self);
-    }
     character_boundaries_t first_boundaries =
 	str_get_character_boundaries(self, first);
     character_boundaries_t last_boundaries =
@@ -688,13 +579,13 @@
 	rb_raise(rb_eArgError, "negative string size (or size too big)");
     }
     if (self->capacity_in_bytes < new_capacity) {
-	if (self->data.bytes == NULL) {
-	    GC_WB(&self->data.bytes, xmalloc(new_capacity));
+	if (self->bytes == NULL) {
+	    GC_WB(&self->bytes, xmalloc(new_capacity));
 	}
 	else {
-	    char *bytes = xrealloc(self->data.bytes, new_capacity);
-	    if (bytes != self->data.bytes) {
-		GC_WB(&self->data.bytes, bytes);
+	    char *bytes = xrealloc(self->bytes, new_capacity);
+	    if (bytes != self->bytes) {
+		GC_WB(&self->bytes, bytes);
 	    }
 	}
 	self->capacity_in_bytes = new_capacity;
@@ -704,13 +595,11 @@
 static void
 str_ensure_null_terminator(rb_str_t *self)
 {
-    assert(!str_is_stored_in_uchars(self) || NATIVE_UTF16_ENC(self->encoding));
-
     if (self->length_in_bytes > 0
 	&& (self->capacity_in_bytes == self->length_in_bytes
-	    || self->data.bytes[self->length_in_bytes] != '\0')) {
+	    || self->bytes[self->length_in_bytes] != '\0')) {
 	str_resize_bytes(self, self->length_in_bytes + 1);
-	self->data.bytes[self->length_in_bytes] = '\0';
+	self->bytes[self->length_in_bytes] = '\0';
     }
 }
 
@@ -722,7 +611,6 @@
 
     if (str != NULL) {
 	str_must_have_compatible_encoding(self, str);
-	str_make_same_format(self, str);
     }
 
     character_boundaries_t beg, end;
@@ -766,18 +654,18 @@
 	    && end.end_offset_in_bytes == self->length_in_bytes) {
     	if (bytes_to_add > 0) {
 	    // We are splicing at the very end.
-	    memcpy(self->data.bytes + self->length_in_bytes, str->data.bytes,
+	    memcpy(self->bytes + self->length_in_bytes, str->bytes,
 		    bytes_to_add);
 	}
     }
     else {
 	// We are splicing in the middle.
-	memmove(self->data.bytes + beg.start_offset_in_bytes + bytes_to_add,
-		self->data.bytes + end.end_offset_in_bytes,
+	memmove(self->bytes + beg.start_offset_in_bytes + bytes_to_add,
+		self->bytes + end.end_offset_in_bytes,
 		self->length_in_bytes - end.end_offset_in_bytes);
 	if (bytes_to_add > 0) {
-	    memcpy(self->data.bytes + beg.start_offset_in_bytes,
-		    str->data.bytes, bytes_to_add);
+	    memcpy(self->bytes + beg.start_offset_in_bytes,
+		    str->bytes, bytes_to_add);
 	}
     }
 
@@ -804,28 +692,57 @@
 
     const long new_length_in_bytes = self->length_in_bytes + len;
 
+    str_reset_flags(self);
     str_resize_bytes(self, new_length_in_bytes);
-    memcpy(self->data.bytes + self->length_in_bytes, bytes, len);
+    memcpy(self->bytes + self->length_in_bytes, bytes, len);
     self->length_in_bytes = new_length_in_bytes;
 }
 
 static void
 str_concat_uchars(rb_str_t *self, const UChar *chars, long len)
 {
-    if (str_try_making_data_uchars(self)) {
-	str_concat_bytes(self, (const char *)chars, UCHARS_TO_BYTES(len)); 
+    if (len == 0) {
+	return;
     }
-    else {
-	assert(BINARY_ENC(RSTR(self)->encoding));
+    str_reset_flags(self);
+    if (IS_UTF8_ENC(self->encoding)) {
+	long new_length_in_bytes = self->length_in_bytes;
+	for (long i = 0; i < len; ) {
+	    UChar32 c;
+	    U16_NEXT(chars, i, len, c);
+	    new_length_in_bytes += U8_LENGTH(c);
+	}
+	str_resize_bytes(self, new_length_in_bytes);
+	for (long pos_in_src = 0, pos_in_dst = self->length_in_bytes;
+		pos_in_src < len; ) {
+	    UChar32 c;
+	    UBool is_error;
+	    U16_NEXT(chars, pos_in_src, len, c);
+	    U8_APPEND((uint8_t *)self->bytes, pos_in_dst,
+		    new_length_in_bytes, c, is_error);
+	}
+	self->length_in_bytes = new_length_in_bytes;
+    }
+    else if (IS_NATIVE_UTF16_ENC(self->encoding)) {
+	str_concat_bytes(self, (char *)chars, UCHARS_TO_BYTES(len));
+    }
+    else if (IS_BINARY_ENC(self->encoding) || IS_ASCII_ENC(self->encoding)) {
 	const long new_length_in_bytes = RSTR(self)->length_in_bytes + len;
 
 	str_resize_bytes(self, new_length_in_bytes);
-	char *ptr = (RSTR(self)->data.bytes + RSTR(self)->length_in_bytes);
+	char *ptr = (RSTR(self)->bytes + RSTR(self)->length_in_bytes);
 	for (int i = 0; i < len; ++i) {
 	    ptr[i] = chars[i];
 	}
 	self->length_in_bytes = new_length_in_bytes;
     }
+    else {
+	rb_str_t *str = RSTR(rb_enc_str_new((char *)chars,
+		    UCHARS_TO_BYTES(len),
+		    rb_encodings[ENCODING_UTF16_NATIVE]));
+	str = str_simple_transcode(str, self->encoding);
+	str_concat_bytes(self, str->bytes, str->length_in_bytes);
+    }
 }
 
 static void
@@ -836,21 +753,9 @@
     }
 
     rb_encoding_t *enc = str_must_have_compatible_encoding(self, str);
-    str_make_same_format(self, str);
-
-    // TODO: we should maybe merge flags
-    // (if both are ASCII-only, the concatenation is ASCII-only,
-    //  though I'm not sure all the tests required are worth doing)
-    str_unset_facultative_flags(self);
-
-    str_concat_bytes(self, str->data.bytes, str->length_in_bytes);
-
-    if (enc != self->encoding) {
-	self->encoding = enc;
-	if (NATIVE_UTF16_ENC(enc)) {
-	    str_set_stored_in_uchars(self, true);
-	}
-    }
+    self->encoding = enc;
+    str_reset_flags(self);
+    str_concat_bytes(self, str->bytes, str->length_in_bytes);
 }
 
 static int
@@ -870,12 +775,10 @@
 	return -1;
     }
 
-    str_make_same_format(self, str);
-
     const long min_len = self->length_in_bytes < str->length_in_bytes
 	? self->length_in_bytes : str->length_in_bytes;
 
-    const int res = memcmp(self->data.bytes, str->data.bytes, min_len);
+    const int res = memcmp(self->bytes, str->bytes, min_len);
 
     if (res == 0) {
 	if (self->length_in_bytes == str->length_in_bytes) {
@@ -910,37 +813,20 @@
 	return -1;
     }
 
-    str_make_same_format(self, str);
-
     const long min_length = self->length_in_bytes < str->length_in_bytes
 	? self->length_in_bytes : str->length_in_bytes;
 
-    if (str_is_stored_in_uchars(str)) {
-	for (long i = 0; i < BYTES_TO_UCHARS(min_length); i++) {
-	    UChar c1 = self->data.uchars[i];
-	    UChar c2 = str->data.uchars[i];
+    for (long i = 0; i < min_length; i++) {
+	char c1 = self->bytes[i];
+	char c2 = str->bytes[i];
+	if (c1 != c2) {
+	    c1 = isascii(c1) ? toupper(c1) : c1;
+	    c2 = isascii(c2) ? toupper(c2) : c2;
 	    if (c1 != c2) {
-		c1 = isascii(c1) ? toupper(c1) : c1;
-		c2 = isascii(c2) ? toupper(c2) : c2;
-		if (c1 != c2) {
-		    return c1 < c2 ? -1 : 1;
-		}
+		return c1 < c2 ? -1 : 1;
 	    }
 	}
     }
-    else {
-	for (long i = 0; i < min_length; i++) {
-	    char c1 = self->data.bytes[i];
-	    char c2 = str->data.bytes[i];
-	    if (c1 != c2) {
-		c1 = isascii(c1) ? toupper(c1) : c1;
-		c2 = isascii(c2) ? toupper(c2) : c2;
-		if (c1 != c2) {
-		    return c1 < c2 ? -1 : 1;
-		}
-	    }
-	}
-    }
 
     if (self->length_in_bytes == str->length_in_bytes) {
 	return 0;
@@ -959,24 +845,14 @@
 	return 0;
     }
 
-    if (str_is_stored_in_uchars(self)) {
+    if (self->encoding->single_byte_encoding) {
+	return offset_in_bytes;
+    }
+    else if (IS_UTF16_ENC(self->encoding)) {
 	return BYTES_TO_UCHARS(offset_in_bytes);
     }
     else {
-	if (self->encoding->single_byte_encoding) {
-	    return offset_in_bytes;
-	}
-	else if (UTF32_ENC(self->encoding)
-		&& str_known_not_to_have_any_supplementary(self)) {
-	    return offset_in_bytes / 4;
-	}
-	else if (NON_NATIVE_UTF16_ENC(self->encoding)) {
-	    return BYTES_TO_UCHARS(offset_in_bytes);
-	}
-	else {
-	    return str_ucnv_offset_in_bytes_to_index(self,
-		    offset_in_bytes, true);
-	}
+	return str_ucnv_offset_in_bytes_to_index(self, offset_in_bytes, true);
     }
 }
 
@@ -995,24 +871,17 @@
 	return backward_search ? end_offset_in_bytes : start_offset_in_bytes;
     }
     str_must_have_compatible_encoding(self, searched);
-    str_make_same_format(self, searched);
     if (searched->length_in_bytes > self->length_in_bytes) {
 	return -1;
     }
 
-    long increment;
-    if (str_is_stored_in_uchars(self)) {
-	increment = 2;
-    }
-    else {
-	increment = self->encoding->min_char_size;
-    }
+    long increment = self->encoding->min_char_size;
 
     if (backward_search) {
 	for (long offset = end_offset_in_bytes - increment;
 		offset >= start_offset_in_bytes;
 		offset -= increment) {
-	    if (memcmp(self->data.bytes + offset, searched->data.bytes,
+	    if (memcmp(self->bytes + offset, searched->bytes,
 			searched->length_in_bytes) == 0) {
 		return offset;
 	    }
@@ -1025,7 +894,7 @@
 	for (long offset = start_offset_in_bytes;
 		offset < max_offset_in_bytes;
 		offset += increment) {
-	    if (memcmp(self->data.bytes + offset, searched->data.bytes,
+	    if (memcmp(self->bytes + offset, searched->bytes,
 			searched->length_in_bytes) == 0) {
 		return offset;
 	    }
@@ -1039,7 +908,6 @@
 	long end_index, bool backward_search)
 {
     str_must_have_compatible_encoding(self, searched);
-    str_make_same_format(self, searched);
 
     if (searched->length_in_bytes == 0 && self->length_in_bytes == 0) {
 	return start_index;
@@ -1129,20 +997,29 @@
     bool need_free = false;
 
     if (IS_RSTR(str)) {
-	if (str_try_making_data_uchars(RSTR(str))) {
-	    chars = RSTR(str)->data.uchars;
-	    chars_len = str_length(RSTR(str));
-	}
-	else {
-	    //assert(BINARY_ENC(RSTR(str)->encoding));
-	    chars_len = RSTR(str)->length_in_bytes;
-	    if (chars_len > 0) {
-		chars = (UChar *)malloc(sizeof(UChar) * chars_len);
-		for (long i = 0; i < chars_len; i++) {
-		    chars[i] = RSTR(str)->data.bytes[i];
+	chars_len = str_length(RSTR(str));
+	if (chars_len > 0) {
+	    chars = (UChar *)malloc(sizeof(UChar) * chars_len);
+	    __block long pos = 0;
+	    str_each_uchar32(RSTR(str), ^(UChar32 c, long start_index, long char_len, bool *stop) {
+		if (c == U_SENTINEL) {
+		    if (char_len == 1) {
+			chars[pos++] = RSTR(str)->bytes[start_index];
+		    }
+		    else {
+			abort(); // TODO
+		    }
 		}
-		need_free = true;
-	    }
+		else if (U_IS_BMP(c)) {
+		    chars[pos++] = c;
+		}
+		else {
+		    chars[pos++] = U16_LEAD(c);
+		    chars[pos++] = U16_TRAIL(c);
+		}
+	    });
+	    assert(pos == chars_len);
+	    need_free = true;
 	}
     }
     else {
@@ -1232,7 +1109,6 @@
 static void inline
 str_concat_ascii_cstr(rb_str_t *self, char *cstr)
 {
-    str_make_data_binary(self);
     long len = strlen(cstr);
     if (self->encoding->ascii_compatible) {
 	str_concat_bytes(self, cstr, len);
@@ -1240,7 +1116,7 @@
     else {
 	rb_str_t *str = RSTR(rb_enc_str_new(cstr, len, rb_encodings[ENCODING_ASCII]));
 	str = str_simple_transcode(str, self->encoding);
-	str_concat_bytes(self, str->data.bytes, str->length_in_bytes);
+	str_concat_bytes(self, str->bytes, str->length_in_bytes);
     }
 }
 
@@ -1263,25 +1139,15 @@
 	return dst_str;
     }
 
-    if (src_encoding == self->encoding) {
-	// if the string can already be converted in UTF-16, half the job is done
-	str_try_making_data_uchars(self);
-    }
-    else {
-	// if the source encoding is not the string encoding
-	// we must be sure to start from the bytes, not UTF-16
-	str_make_data_binary(self);
-    }
-
     rb_encoding_t *src_encoding_used;
     rb_encoding_t *dst_encoding_used;
-    if (BINARY_ENC(dst_encoding)) {
+    if (IS_BINARY_ENC(dst_encoding)) {
 	dst_encoding_used = rb_encodings[ENCODING_ASCII];
     }
     else {
 	dst_encoding_used = dst_encoding;
     }
-    if (BINARY_ENC(src_encoding)) {
+    if (IS_BINARY_ENC(src_encoding)) {
 	src_encoding_used = rb_encodings[ENCODING_ASCII];
     }
     else {
@@ -1296,17 +1162,10 @@
     for (;;) {
 	UChar *utf16;
 	long utf16_length;
-	// if the encoding is native UTF-16 it's always stored in UChars
-	// but it can contain invalid bytes
-	if (str_is_stored_in_uchars(self) && !NATIVE_UTF16_ENC(self->encoding)) {
-	    utf16 = self->data.uchars;
-	    utf16_length = BYTES_TO_UCHARS(self->length_in_bytes);
-	    pos_in_src = self->length_in_bytes;
-	}
-	else {
-	    str_ucnv_transcode_to_utf16(src_encoding_used,
+	// we need to transcode even if the source encoding is native UTF-16
+	// because it could contain invalid bytes
+	str_ucnv_transcode_to_utf16(src_encoding_used,
 		    self, &pos_in_src, &utf16, &utf16_length);
-	}
 
 	if (utf16_length > 0) {
 	    if ((behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT)
@@ -1398,7 +1257,7 @@
 			    break;
 			case TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING:
 			    if (replacement_str->length_in_bytes > 0) {
-				str_concat_bytes(dst_str, replacement_str->data.bytes, replacement_str->length_in_bytes);
+				str_concat_bytes(dst_str, replacement_str->bytes, replacement_str->length_in_bytes);
 			    }
 			    break;
 			case TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT:
@@ -1431,7 +1290,7 @@
 			char *bytes_list = xmalloc(invalid_bytes_length * 4);
 			char *bytes_list_pos = bytes_list;
 			for (long i = 0; i < invalid_bytes_length; ++i) {
-			    sprintf(bytes_list_pos, "\\x%02X", (unsigned char)self->data.bytes[pos_in_src+i]);
+			    sprintf(bytes_list_pos, "\\x%02X", (unsigned char)self->bytes[pos_in_src+i]);
 			    bytes_list_pos += 4;
 			}
 			rb_raise(rb_eInvalidByteSequenceError, "\"%s\" on %s", bytes_list, src_encoding->public_name);
@@ -1439,7 +1298,7 @@
 		    break;
 		case TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING:
 		    if (replacement_str->length_in_bytes > 0) {
-			str_concat_bytes(dst_str, replacement_str->data.bytes, replacement_str->length_in_bytes);
+			str_concat_bytes(dst_str, replacement_str->bytes, replacement_str->length_in_bytes);
 		    }
 		    break;
 		default:
@@ -1457,11 +1316,6 @@
 	str_concat_ascii_cstr(dst_str, "\"");
     }
 
-    if (NATIVE_UTF16_ENC(dst_str->encoding)) {
-	str_set_stored_in_uchars(dst_str, true);
-    }
-
-
     return dst_str;
 }
 
@@ -1660,46 +1514,16 @@
     unsigned char c = 0;
     long idx = NUM2LONG(index);
 
-    if (str_is_stored_in_uchars(RSTR(self))
-	    && NATIVE_UTF16_ENC(RSTR(self)->encoding)) {
+    if (idx < 0) {
+	idx += RSTR(self)->length_in_bytes;
 	if (idx < 0) {
-	    idx += RSTR(self)->length_in_bytes;
-	    if (idx < 0) {
-		return Qnil;
-	    }
-	}
-	if (idx >= RSTR(self)->length_in_bytes) {
 	    return Qnil;
 	}
-	if (NATIVE_UTF16_ENC(RSTR(self)->encoding)) {
-	    c = RSTR(self)->data.bytes[idx];
-	}
-	else { // non native byte-order UTF-16
-	    if ((idx & 1) == 0) { // even
-		c = RSTR(self)->data.bytes[idx+1];
-	    }
-	    else { // odd
-		c = RSTR(self)->data.bytes[idx-1];
-	    }
-	}
     }
-    else {
-	// work with a binary string
-	// (UTF-16 strings could be converted to their binary form
-	//  on the fly but that would just add complexity)
-	str_make_data_binary(RSTR(self));
-
-	if (idx < 0) {
-	    idx += RSTR(self)->length_in_bytes;
-	    if (idx < 0) {
-		return Qnil;
-	    }
-	}
-	if (idx >= RSTR(self)->length_in_bytes) {
-	    return Qnil;
-	}
-	c = RSTR(self)->data.bytes[idx];
+    if (idx >= RSTR(self)->length_in_bytes) {
+	return Qnil;
     }
+    c = RSTR(self)->bytes[idx];
 
     return INT2FIX(c); 
 }
@@ -1715,7 +1539,6 @@
 rstr_setbyte(VALUE self, SEL sel, VALUE idx, VALUE value)
 {
     rstr_modify(self);
-    str_make_data_binary(RSTR(self));
 
     long index = NUM2LONG(idx);
     int byte = NUM2INT(value);
@@ -1726,7 +1549,7 @@
     if (index < 0) {
 	index += RSTR(self)->length_in_bytes;
     }
-    RSTR(self)->data.bytes[index] = byte;
+    RSTR(self)->bytes[index] = byte;
     return value;
 }
 
@@ -1741,8 +1564,7 @@
 static VALUE
 rstr_to_data(VALUE self, SEL sel)
 {
-    str_make_data_binary(RSTR(self));
-    CFDataRef data = CFDataCreate(NULL, (const UInt8 *)RSTR(self)->data.bytes,
+    CFDataRef data = CFDataCreate(NULL, (const UInt8 *)RSTR(self)->bytes,
 	    RSTR(self)->length_in_bytes); 
     CFMakeCollectable(data);
     return (VALUE)data;
@@ -1760,8 +1582,7 @@
 static VALUE
 rstr_pointer(VALUE self, SEL sel)
 {
-    str_make_data_binary(RSTR(self));
-    return rb_pointer_new("C", RSTR(self)->data.bytes,
+    return rb_pointer_new("C", RSTR(self)->bytes,
 	    RSTR(self)->length_in_bytes); 
 }
 
@@ -1777,15 +1598,8 @@
 {
     assert(IS_RSTR(str));
     if (enc != RSTR(str)->encoding) {
-	str_make_data_binary(RSTR(str));
-	if (NATIVE_UTF16_ENC(RSTR(str)->encoding)) {
-	    str_set_stored_in_uchars(RSTR(str), false);
-	}
 	RSTR(str)->encoding = enc;
-	str_unset_facultative_flags(RSTR(str));
-	if (NATIVE_UTF16_ENC(RSTR(str)->encoding)) {
-	    str_set_stored_in_uchars(RSTR(str), true);
-	}
+	str_reset_flags(RSTR(str));
     }
 }
 
@@ -2627,12 +2441,6 @@
     return Qfalse;
 }
 
-static VALUE
-rstr_is_stored_in_uchars(VALUE self, SEL sel)
-{
-    return str_is_stored_in_uchars(RSTR(self)) ? Qtrue : Qfalse;
-}
-
 /*
  *  call-seq:
  *     str.to_s     => str
@@ -2707,26 +2515,19 @@
 static VALUE
 str_inspect(rb_str_t *str, bool dump)
 {
-    const bool uchars = str_try_making_data_uchars(str);
-    const long len = uchars
-	? str_length(str) : str->length_in_bytes;
-
     VALUE result;
-    if (len == 0) {
+    if (str->length_in_bytes == 0) {
 	result = rb_str_new2("\"\"");
 	OBJ_INFECT(result, str);
 	return result;
     }
 
-    // Allocate an UTF-8 string with a good initial capacity.
-    // Binary strings will likely have most bytes escaped.
-    const long result_init_len =
-	BINARY_ENC(str->encoding) ? (len * 5) + 2 : len + 2;
+    const long result_init_len = str->length_in_bytes * 3 / 2;
     result = rb_unicode_str_new(NULL, result_init_len);
 
     inspect_append(result, '"', false);
     __block UChar32 prev = 0;
-    str_each_char(str, ^(UChar32 c, const char* char_start, long char_len, bool *stop) {
+    str_each_uchar32(str, ^(UChar32 c, long start_index, long char_len, bool *stop) {
 	bool print = iswprint(c);
 	if (dump && prev == '#') {
 	    inspect_append(result, prev, (c == '$' || c == '@' || c == '{'));
@@ -2766,7 +2567,7 @@
 	else {
 	    char buf[10];
 	    for (long i = 0; i < char_len; ++i) {
-		uint8_t byte = (uint8_t)char_start[i];
+		uint8_t byte = (uint8_t)str->bytes[start_index+i];
 		snprintf(buf, sizeof buf, "\\x%02X", byte);
 		char *p = buf;
 		while (*p != '\0') {
@@ -3978,42 +3779,60 @@
  *  changes were made.
  *  Note: case replacement is effective only in ASCII region.
  */
-
-#define CHAR_ITERATE(str, code) \
-    if (str_try_making_data_uchars(RSTR(str))) { \
-	for (long i = 0, count = BYTES_TO_UCHARS(RSTR(str)->length_in_bytes); \
-		i < count; i++) { \
-	    UChar __tmp, c; \
-	    __tmp = c = RSTR(str)->data.uchars[i]; \
-	    code; \
-	    if (__tmp != c) { \
-		RSTR(str)->data.uchars[i] = c; \
-	    } \
-	} \
-    } \
-    else { \
-	for (long i = 0, count = RSTR(str)->length_in_bytes; \
-		i < count; i++) { \
-	    char __tmp, c; \
-	    __tmp = c = RSTR(str)->data.bytes[i]; \
-	    code; \
-	    if (__tmp != c) { \
-		RSTR(str)->data.bytes[i] = c; \
-	    } \
-	} \
+typedef char (^change_case_callback_t)(char c, bool first_char); // maps one ASCII letter to its replacement; first_char is true at offset 0
+static bool // returns true if at least one byte of str was rewritten, false otherwise
+rstr_change_case(VALUE str, change_case_callback_t callback)
+{
+    if (RSTR(str)->encoding->ascii_compatible) { // scan the raw bytes one by one
+	bool changed = false;
+	for (long i = 0; i < RSTR(str)->length_in_bytes; ++i) {
+	    char c = RSTR(str)->bytes[i];
+	    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { // only ASCII letters are passed to the callback
+		char new_c = callback(c, i == 0);
+		if (new_c != c) {
+		    changed = true;
+		    RSTR(str)->bytes[i] = new_c;
+		}
+	    }
+	}
+	return changed;
+    }
+    else {
+	if (!IS_UTF32_ENC(RSTR(str)->encoding)
+		&& !IS_UTF16_ENC(RSTR(str)->encoding)) {
+	    abort(); // should not happen
+	}
+	__block bool changed = false; // must start false so that an untouched string is reported as unchanged (callers then return nil)
+	str_each_uchar32(RSTR(str), ^(UChar32 c, long start_index, long char_len, bool *stop) {
+	    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
+		char new_c = callback(c, start_index == 0);
+		if (new_c != c) {
+		    changed = true;
+		    memset(&RSTR(str)->bytes[start_index], 0, char_len); // zero the whole character, then store the letter in one byte
+		    if (RSTR(str)->encoding->little_endian) {
+			RSTR(str)->bytes[start_index] = new_c; // little endian: the letter goes in the first byte
+		    }
+		    else {
+			RSTR(str)->bytes[start_index+char_len-1] = new_c; // otherwise: the letter goes in the last byte
+		    }
+		}
+	    }
+	});
+	return changed;
+    }
+}
 
 static VALUE
 rstr_downcase_bang(VALUE str, SEL sel)
 {
     rstr_modify(str);
 
-    bool changed = false;
-    CHAR_ITERATE(str,
+    bool changed = rstr_change_case(str, ^ char (char c, bool first_char) {
 	if (c >= 'A' && c <= 'Z') {
-	    c = 'a' + (c - 'A');
-	    changed = true; 
-	});
+	    return 'a' + (c - 'A');
+	}
+	return c;
+    });
 
     return changed ? str : Qnil;
 }
@@ -4052,12 +3871,12 @@
 {
     rstr_modify(str);
 
-    bool changed = false;
-    CHAR_ITERATE(str,
+    bool changed = rstr_change_case(str, ^ char (char c, bool first_char) {
 	if (c >= 'a' && c <= 'z') {
-	    c = 'A' + (c - 'a');
-	    changed = true; 
-	});
+	    return 'A' + (c - 'a');
+	}
+	return c;
+    });
 
     return changed ? str : Qnil;
 }
@@ -4096,16 +3915,15 @@
 {
     rstr_modify(str);
 
-    bool changed = false;
-    CHAR_ITERATE(str,
+    bool changed = rstr_change_case(str, ^ char (char c, bool first_char) {
 	if (c >= 'A' && c <= 'Z') {
-	    c = 'a' + (c - 'A');
-	    changed = true; 
+	    return 'a' + (c - 'A');
 	}
         else if (c >= 'a' && c <= 'z') {
-	    c = 'A' + (c - 'a');
-	    changed = true;
-	});
+	    return 'A' + (c - 'a');
+	}
+	return c;
+    });
 
     return changed ? str : Qnil;
 }
@@ -4149,18 +3967,17 @@
 {
     rstr_modify(str);
 
-    bool changed = false;
-    CHAR_ITERATE(str,
-        if (i == 0) {
+    bool changed = rstr_change_case(str, ^ char (char c, bool first_char) {
+        if (first_char) {
 	    if (c >= 'a' && c <= 'z') {
-		c = 'A' + (c - 'a');
-		changed = true;
+		return 'A' + (c - 'a');
 	    }
 	}
 	else if (c >= 'A' && c <= 'Z') {
-	    c = 'a' + (c - 'A');
-	    changed = true; 
-	});
+	    return 'a' + (c - 'A');
+	}
+	return c;
+    });
 
     return changed ? str : Qnil;
 }
@@ -4236,9 +4053,6 @@
     const long len = str_length(RSTR(str));
     long width = NUM2LONG(w);
     str = rb_str_new3(str);
-    if (str_is_stored_in_uchars(RSTR(padstr))) {
-	str_try_making_data_uchars(RSTR(str));
-    }
     if (width < 0 || width <= len) {
 	return str;
     }
@@ -4609,24 +4423,20 @@
 {
     RETURN_ENUMERATOR(str, 0, 0);
 
-    UChar *chars = NULL;
-    long chars_len = 0;
-    bool need_free = false;
-    rb_str_get_uchars(str, &chars, &chars_len, &need_free);
+    __block VALUE return_value = str;
 
-    for (long i = 0; i < chars_len; i++) {
-	VALUE charstr = rb_unicode_str_new(&chars[i], 1);
+    str_each_uchar32(RSTR(str), ^(UChar32 c, long start_index, long char_len, bool *stop) {
+	VALUE charstr = (VALUE)str_new_copy_of_part(RSTR(str),
+	    start_index, char_len);
 	rb_yield(charstr);
-	ENSURE_AND_RETURN_IF_BROKEN(
-	    if (need_free) free(chars)
-	);
-    }
+	VALUE v = rb_vm_pop_broken_value(); \
+	if (v != Qundef) {
+	    return_value = v;
+	    *stop = true;
+	}
+    });
 
-    if (need_free) {
-	free(chars);
-    }
-
-    return str;
+    return return_value;
 }
 
 /*
@@ -4646,12 +4456,10 @@
 static VALUE
 rstr_each_byte(VALUE str, SEL sel)
 {
-    str_make_data_binary(RSTR(str));
-
     RETURN_ENUMERATOR(str, 0, 0);
 
     for (long i = 0; i < RSTR(str)->length_in_bytes; i++) {
-	rb_yield(INT2FIX((unsigned char)RSTR(str)->data.bytes[i]));
+	rb_yield(INT2FIX((unsigned char)RSTR(str)->bytes[i]));
 	RETURN_IF_BROKEN();
     }
     return str;
@@ -4726,23 +4534,21 @@
 	return str;
     }
 
-    if (!str_try_making_data_uchars(RSTR(str))) {
-	rb_raise(rb_eArgError,
-		"cannot make receiver data as Unicode characters");
+    if (!RSTR(str)->encoding->ascii_compatible) {
+	rb_raise(rb_eArgError, "The encoding must be ASCII-compatible");
     }
 
-    UChar *chars_buf = (UChar *)malloc(RSTR(str)->length_in_bytes
-	    + sizeof(UChar));
-    UChar *chars_ptr = &chars_buf[1];
+    char *chars_buf = (char *)malloc(RSTR(str)->length_in_bytes + 1);
+    char *chars_ptr = &chars_buf[1];
 
-    memcpy(chars_ptr, RSTR(str)->data.uchars, RSTR(str)->length_in_bytes);
+    memcpy(chars_ptr, RSTR(str)->bytes, RSTR(str)->length_in_bytes);
 
-    long len = BYTES_TO_UCHARS(RSTR(str)->length_in_bytes);
-    UChar carry = 0;
+    long len = RSTR(str)->length_in_bytes;
+    char carry = 0;
     bool modified = false;
 
     for (long i = len - 1; i >= 0; i--) {
-	UChar c = chars_ptr[i];
+	char c = chars_ptr[i];
 	if (isdigit(c)) {
 	    modified = true;
 	    if (c != '9') {
@@ -4787,7 +4593,7 @@
 	len++;
     }
 
-    VALUE newstr = rb_unicode_str_new(chars_ptr, len);
+    VALUE newstr = rb_enc_str_new(chars_ptr, len, RSTR(str)->encoding);
     free(chars_buf);
     OBJ_INFECT(newstr, str);
 
@@ -4962,21 +4768,20 @@
 	return str;
     }
 
-    str_make_data_binary(RSTR(str));
     char *new_bytes = xmalloc(RSTR(str)->length_in_bytes);
     __block long pos = RSTR(str)->length_in_bytes;
-    str_each_char(RSTR(str), ^(UChar32 c, const char* char_start, long char_len, bool *stop) {
+    str_each_uchar32(RSTR(str), ^(UChar32 c, long start_index, long char_len, bool *stop) {
 	pos -= char_len;
-	memcpy(&new_bytes[pos], char_start, char_len);
+	memcpy(&new_bytes[pos], &RSTR(str)->bytes[start_index], char_len);
     });
     assert(pos == 0);
 
     RSTR(str)->capacity_in_bytes = RSTR(str)->length_in_bytes;
-    GC_WB(&RSTR(str)->data.bytes, new_bytes);
+    GC_WB(&RSTR(str)->bytes, new_bytes);
 
     // we modify it directly so the information stored
     // in the facultative flags might be outdated
-    str_unset_facultative_flags(RSTR(str));
+    str_reset_flags(RSTR(str));
 
     return str;
 }
@@ -5533,7 +5338,7 @@
 	    sum = rb_vm_call_simple(sum, selPLUS, LONG2FIX(sum0));
 	    sum0 = 0;
 	}
-	sum0 += (unsigned char)RSTR(str)->data.bytes[i];
+	sum0 += (unsigned char)RSTR(str)->bytes[i];
     }
     if (bits == 0) {
 	if (sum0 != 0) {
@@ -5723,17 +5528,38 @@
 {
     check_bounds(rcv, range.location + range.length, true);
     if (range.length > 0) {
-	if (str_try_making_data_uchars(RSTR(rcv))) {
-	    memcpy(buffer, &RSTR(rcv)->data.uchars[range.location],
-		    sizeof(UniChar) * range.length);
-	}
-	else {
-	    for (long i = range.location, j = 0;
-		    i < range.location + range.length;
-		    i++, j++) {
-		buffer[j] = RSTR(rcv)->data.bytes[i];
+	__block long pos_in_src = 0;
+	__block long pos_in_dst = 0;
+	str_each_uchar32(RSTR(rcv), ^(UChar32 c, long start_index, long char_len, bool *stop) {
+	    if (pos_in_src >= range.location) {
+		if (c == U_SENTINEL) {
+		    if (char_len == 1) {
+			buffer[pos_in_dst++] = RSTR(rcv)->bytes[start_index];
+		    }
+		    else {
+			abort(); // TODO
+		    }
+		}
+		else if (U_IS_BMP(c)) {
+		    buffer[pos_in_dst++] = c;
+		}
+		else {
+		    buffer[pos_in_dst++] = U16_LEAD(c);
+		    if (pos_in_dst < range.length) {
+			buffer[pos_in_dst++] = U16_TRAIL(c);
+		    }
+		}
 	    }
-	}
+	    if ((c == U_SENTINEL) || U_IS_BMP(c)) {
+		pos_in_src++;
+	    }
+	    else {
+		pos_in_src += 2;
+	    }
+	    if (pos_in_dst >= range.length) {
+		*stop = true;
+	    }
+	});
     }
 }
 
@@ -5882,10 +5708,6 @@
     // MacRuby extensions.
     rb_objc_define_method(rb_cRubyString, "transform", rstr_transform, 1);
 
-    // MacRuby extensions (debugging).
-    rb_objc_define_method(rb_cRubyString, "__stored_in_uchars__?",
-	    rstr_is_stored_in_uchars, 0);
-
     // Cocoa primitives.
     rb_objc_install_method2((Class)rb_cRubyString, "length",
 	    (IMP)rstr_imp_length);
@@ -5928,8 +5750,6 @@
 
 // ByteString emulation.
 
-#define IS_BSTR(obj) (IS_RSTR(obj) && !str_is_stored_in_uchars(RSTR(obj)))
-
 VALUE
 rb_str_bstr(VALUE str)
 {
@@ -5957,15 +5777,14 @@
 	    free(buf);
 	}
     }
-    str_make_data_binary(RSTR(str));
     return str;
 }
 
 uint8_t *
 rb_bstr_bytes(VALUE str)
 {
-    assert(IS_BSTR(str));
-    return (uint8_t *)RSTR(str)->data.bytes;
+    assert(IS_RSTR(str));
+    return (uint8_t *)RSTR(str)->bytes;
 }
 
 VALUE
@@ -5986,21 +5805,21 @@
 long
 rb_bstr_length(VALUE str)
 {
-    assert(IS_BSTR(str));
+    assert(IS_RSTR(str));
     return RSTR(str)->length_in_bytes;
 }
 
 void
 rb_bstr_concat(VALUE str, const uint8_t *bytes, long len)
 {
-    assert(IS_BSTR(str));
+    assert(IS_RSTR(str));
     str_concat_bytes(RSTR(str), (const char *)bytes, len);
 }
 
 void
 rb_bstr_resize(VALUE str, long capa)
 {
-    assert(IS_BSTR(str));
+    assert(IS_RSTR(str));
     str_resize_bytes(RSTR(str), capa);
     RSTR(str)->length_in_bytes = capa;
 }
@@ -6008,7 +5827,7 @@
 void
 rb_bstr_set_length(VALUE str, long len)
 {
-    assert(IS_BSTR(str));
+    assert(IS_RSTR(str));
     assert(len <= RSTR(str)->capacity_in_bytes);
     RSTR(str)->length_in_bytes = len;
 }
@@ -6159,9 +5978,8 @@
 	if (RSTR(str)->length_in_bytes == 0) {
 	    return "";
 	}
-	str_make_data_binary(RSTR(str));
 	str_ensure_null_terminator(RSTR(str));
-	return RSTR(str)->data.bytes;
+	return RSTR(str)->bytes;
     }
 
     // CFString code path, hopefully this should not happen very often.
@@ -6187,7 +6005,6 @@
 rb_str_clen(VALUE str)
 {
     if (IS_RSTR(str)) {
-	str_make_data_binary(RSTR(str));
 	return RSTR(str)->length_in_bytes;
     }
     return CFStringGetLength((CFStringRef)str);
@@ -6431,10 +6248,8 @@
 rb_str_set_len(VALUE str, long len)
 {
     if (IS_RSTR(str)) {
-	const long len_bytes = str_is_stored_in_uchars(RSTR(str))
-	    ? UCHARS_TO_BYTES(len) : len;
-	assert(len_bytes <= RSTR(str)->length_in_bytes);
-	RSTR(str)->length_in_bytes = len_bytes;
+	assert(len <= RSTR(str)->length_in_bytes);
+	RSTR(str)->length_in_bytes = len;
     }
     else {
 	abort(); // TODO

Modified: MacRuby/trunk/transcode.c
===================================================================
--- MacRuby/trunk/transcode.c	2010-12-19 03:10:47 UTC (rev 5053)
+++ MacRuby/trunk/transcode.c	2010-12-19 06:13:10 UTC (rev 5054)
@@ -68,7 +68,7 @@
     if ((enc == NULL) || (enc->ascii_compatible)) {
         return Qnil;
     }
-    else if (UTF16_ENC(enc) || UTF32_ENC(enc)) {
+    else if (IS_UTF16_ENC(enc) || IS_UTF32_ENC(enc)) {
         return (VALUE)rb_utf8_encoding();
     }
     // TODO: Port MRI's table that maps ASCII-incompatible encodings to compatible ones.

Modified: MacRuby/trunk/ucnv.c
===================================================================
--- MacRuby/trunk/ucnv.c	2010-12-19 03:10:47 UTC (rev 5053)
+++ MacRuby/trunk/ucnv.c	2010-12-19 06:13:10 UTC (rev 5054)
@@ -31,15 +31,12 @@
 void
 str_ucnv_update_flags(rb_str_t *self)
 {
-    assert(!str_is_stored_in_uchars(self));
-
     USE_CONVERTER(cnv, self->encoding);
 
     bool ascii_only = true;
     bool valid_encoding = true;
-    bool has_supplementary = false;
 
-    const char *pos = self->data.bytes;
+    const char *pos = self->bytes;
     const char *end = pos + self->length_in_bytes;
     for (;;) {
 	// iterate through the string one Unicode code point at a time
@@ -54,64 +51,31 @@
 		// conversion error
 		valid_encoding = false;
 		ascii_only = false;
+		break;
 	    }
 	}
 	else {
 	    if (c > 127) {
 		ascii_only = false;
-		if (U_IS_SUPPLEMENTARY(c)) {
-		    has_supplementary = true;
-		}
 	    }
 	}
     }
 
     ucnv_close(cnv);
 
-    str_set_has_supplementary(self, has_supplementary);
     str_set_valid_encoding(self, valid_encoding);
     str_set_ascii_only(self, ascii_only);
 }
 
-void
-str_ucnv_make_data_binary(rb_str_t *self)
-{
-    assert(str_is_stored_in_uchars(self));
-
-    USE_CONVERTER(cnv, self->encoding);
-
-    UErrorCode err = U_ZERO_ERROR;
-    long capa = UCNV_GET_MAX_BYTES_FOR_STRING(BYTES_TO_UCHARS(
-		self->length_in_bytes), ucnv_getMaxCharSize(cnv));
-    char *buffer = xmalloc(capa);
-    const UChar *source_pos = self->data.uchars;
-    const UChar *source_end = self->data.uchars
-	+ BYTES_TO_UCHARS(self->length_in_bytes);
-    char *target_pos = buffer;
-    char *target_end = buffer + capa;
-    ucnv_fromUnicode(cnv, &target_pos, target_end, &source_pos, source_end,
-	    NULL, true, &err);
-    // there should never be any conversion error here
-    // (if there's one it means some checking has been forgotten before)
-    assert(U_SUCCESS(err));
-
-    ucnv_close(cnv);
-
-    str_set_stored_in_uchars(self, false);
-    self->capacity_in_bytes = capa;
-    self->length_in_bytes = target_pos - buffer;
-    GC_WB(&self->data.bytes, buffer);
-}
-
 static long
 utf16_bytesize_approximation(rb_encoding_t *enc, int bytesize)
 {
     long approximation;
-    if (UTF16_ENC(enc)) {
+    if (IS_UTF16_ENC(enc)) {
 	approximation = bytesize; // the bytesize in UTF-16 is the same
 				  // whatever the endianness
     }
-    else if (UTF32_ENC(enc)) {
+    else if (IS_UTF32_ENC(enc)) {
 	// the bytesize in UTF-16 is nearly half of the bytesize in UTF-32
 	// (if there characters not in the BMP it's a bit more though)
 	approximation = bytesize / 2;
@@ -129,60 +93,12 @@
     return approximation;
 }
 
-bool
-str_ucnv_try_making_data_uchars(rb_str_t *self)
-{
-    assert(!str_is_stored_in_uchars(self));
-
-    USE_CONVERTER(cnv, self->encoding);
-
-    long capa = utf16_bytesize_approximation(self->encoding,
-	    self->length_in_bytes);
-    const char *source_pos = self->data.bytes;
-    const char *source_end = self->data.bytes + self->length_in_bytes;
-    UChar *buffer = xmalloc(capa);
-    UChar *target_pos = buffer;
-    UErrorCode err = U_ZERO_ERROR;
-    for (;;) {
-	UChar *target_end = buffer + BYTES_TO_UCHARS(capa);
-	err = U_ZERO_ERROR;
-	ucnv_toUnicode(cnv, &target_pos, target_end, &source_pos, source_end,
-		NULL, true, &err);
-	if (err == U_BUFFER_OVERFLOW_ERROR) {
-	    long index = target_pos - buffer;
-	    capa *= 2; // double the buffer's size
-	    buffer = xrealloc(buffer, capa);
-	    target_pos = buffer + index;
-	}
-	else {
-	    break;
-	}
-    }
-
-    ucnv_close(cnv);
-
-    if (U_SUCCESS(err)) {
-	str_set_valid_encoding(self, true);
-	str_set_stored_in_uchars(self, true);
-	self->capacity_in_bytes = capa;
-	self->length_in_bytes = UCHARS_TO_BYTES(target_pos - buffer);
-	GC_WB(&self->data.uchars, buffer);
-	return true;
-    }
-    else {
-	str_set_valid_encoding(self, false);
-	return false;
-    }
-}
-
 long
 str_ucnv_length(rb_str_t *self, bool ucs2_mode)
 {
-    assert(!str_is_stored_in_uchars(self));
-
     USE_CONVERTER(cnv, self->encoding);
 
-    const char *pos = self->data.bytes;
+    const char *pos = self->bytes;
     const char *end = pos + self->length_in_bytes;
     long len = 0;
     bool valid_encoding = true;
@@ -222,14 +138,12 @@
 void rb_ensure_b(void (^b_block)(void), void (^e_block)(void));
 
 void
-str_ucnv_each_char(rb_str_t *self, each_char_callback_t callback)
+str_ucnv_each_uchar32(rb_str_t *self, each_uchar32_callback_t callback)
 {
-    assert(!str_is_stored_in_uchars(self));
-
     USE_CONVERTER(cnv, self->encoding);
 
     rb_ensure_b(^{
-	const char *pos = self->data.bytes;
+	const char *pos = self->bytes;
 	const char *end = pos + self->length_in_bytes;
 	bool stop = false;
 	for (;;) {
@@ -248,7 +162,7 @@
 		    if (char_len > min_char_size) {
 			char_len = min_char_size;
 		    }
-		    callback(U_SENTINEL, char_start_pos, char_len, &stop);
+		    callback(U_SENTINEL, char_start_pos-self->bytes, char_len, &stop);
 		    if (stop) {
 			return;
 		    }
@@ -257,7 +171,7 @@
 	    }
 	    else {
 		long char_len = pos - char_start_pos;
-		callback(c, char_start_pos, char_len, &stop);
+		callback(c, char_start_pos-self->bytes, char_len, &stop);
 		if (stop) {
 		    return;
 		}
@@ -268,48 +182,9 @@
     });
 }
 
-
-#define STACK_BUFFER_SIZE 1024
-long
-str_ucnv_bytesize(rb_str_t *self)
-{
-    assert(str_is_stored_in_uchars(self));
-
-    // for strings stored in UTF-16 for which the Ruby encoding is not UTF-16,
-    // we have to convert back the string in its original encoding to get the
-    // length in bytes
-    USE_CONVERTER(cnv, self->encoding);
-
-    UErrorCode err = U_ZERO_ERROR;
-
-    long len = 0;
-    char buffer[STACK_BUFFER_SIZE];
-    const UChar *source_pos = self->data.uchars;
-    const UChar *source_end = self->data.uchars + BYTES_TO_UCHARS(
-	    self->length_in_bytes);
-    char *target_end = buffer + STACK_BUFFER_SIZE;
-    for (;;) {
-	err = U_ZERO_ERROR;
-	char *target_pos = buffer;
-	ucnv_fromUnicode(cnv, &target_pos, target_end, &source_pos, source_end,
-		NULL, true, &err);
-	len += target_pos - buffer;
-	if (err != U_BUFFER_OVERFLOW_ERROR) {
-	    // if the convertion failed, a check was missing somewhere
-	    assert(U_SUCCESS(err));
-	    break;
-	}
-    }
-
-    ucnv_close(cnv);
-    return len;
-}
-
 character_boundaries_t
 str_ucnv_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode)
 {
-    assert(!str_is_stored_in_uchars(self));
-
     character_boundaries_t boundaries = {-1, -1};
 
     if (index < 0) {
@@ -323,7 +198,7 @@
     // the code has many similarities with str_length
     USE_CONVERTER(cnv, self->encoding);
 
-    const char *pos = self->data.bytes;
+    const char *pos = self->bytes;
     const char *end = pos + self->length_in_bytes;
     long current_index = 0;
     for (;;) {
@@ -336,7 +211,7 @@
 	    // end of the string
 	    break;
 	}
-	long offset_in_bytes = character_start_pos - self->data.bytes;
+	long offset_in_bytes = character_start_pos - self->bytes;
 	long converted_width = pos - character_start_pos;
 	if (U_FAILURE(err)) {
 	    long min_char_size = self->encoding->min_char_size;
@@ -408,12 +283,10 @@
 str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes,
 	bool ucs2_mode)
 {
-    assert(!str_is_stored_in_uchars(self));
-
     // the code has many similarities with str_length
     USE_CONVERTER(cnv, self->encoding);
 
-    const char *current_position = self->data.bytes;
+    const char *current_position = self->bytes;
     const char *searched_position = current_position + offset_in_bytes;
     const char *end = current_position + self->length_in_bytes;
     long index = 0;
@@ -470,8 +343,8 @@
 
     long capa = utf16_bytesize_approximation(src_enc,
 	    self->length_in_bytes);
-    const char *source_pos = self->data.bytes + *pos;
-    const char *source_end = self->data.bytes + self->length_in_bytes;
+    const char *source_pos = self->bytes + *pos;
+    const char *source_end = self->bytes + self->length_in_bytes;
     UChar *buffer = xmalloc(capa);
     UChar *target_pos = buffer;
     UErrorCode err = U_ZERO_ERROR;
@@ -495,7 +368,7 @@
 
     *utf16 = buffer;
     *utf16_length = target_pos - buffer;
-    *pos = source_pos - self->data.bytes;
+    *pos = source_pos - self->bytes;
 
     if (U_FAILURE(err)) {
 	// the invalid character will be skipped by str_transcode
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20101218/7d3b88d8/attachment-0001.html>


More information about the macruby-changes mailing list