[macruby-changes] [4052] MacRuby/trunk

source_changes at macosforge.org source_changes at macosforge.org
Sat May 8 22:47:19 PDT 2010


Revision: 4052
          http://trac.macosforge.org/projects/ruby/changeset/4052
Author:   vincent.isambart at gmail.com
Date:     2010-05-08 22:47:14 -0700 (Sat, 08 May 2010)
Log Message:
-----------
an (incomplete) implementation of String#encode

Modified Paths:
--------------
    MacRuby/trunk/encoding.c
    MacRuby/trunk/encoding.h
    MacRuby/trunk/error.c
    MacRuby/trunk/include/ruby/ruby.h
    MacRuby/trunk/spec/frozen/tags/macruby/core/string/encode_tags.txt
    MacRuby/trunk/string.c
    MacRuby/trunk/ucnv.c

Modified: MacRuby/trunk/encoding.c
===================================================================
--- MacRuby/trunk/encoding.c	2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/encoding.c	2010-05-09 05:47:14 UTC (rev 4052)
@@ -17,7 +17,7 @@
 
 VALUE rb_cEncoding;
 
-static rb_encoding_t *default_internal = NULL;
+rb_encoding_t *default_internal = NULL;
 static rb_encoding_t *default_external = NULL;
 rb_encoding_t *rb_encodings[ENCODINGS_COUNT];
 
@@ -28,6 +28,8 @@
 static long str_undefined_bytesize(rb_str_t *self) { abort(); }
 static character_boundaries_t str_undefined_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode) { abort(); }
 static long str_undefined_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes, bool ucs2_mode) { abort(); }
+static void str_undefined_transcode_to_utf16(struct rb_encoding *src_enc, rb_str_t *self, long *pos, UChar **utf16, long *utf16_length) { abort(); } // placeholder: aborts the process if it is ever called
+static void str_undefined_transcode_from_utf16(struct rb_encoding *dst_enc, UChar *utf16, long utf16_length, long *pos, char **bytes, long *bytes_length) { abort(); } // placeholder: aborts the process if it is ever called
 
 static VALUE
 mr_enc_s_list(VALUE klass, SEL sel)
@@ -235,6 +237,10 @@
 	str_undefined_get_character_boundaries;
     encoding->methods.offset_in_bytes_to_index =
 	str_undefined_offset_in_bytes_to_index;
+    encoding->methods.transcode_to_utf16 =
+	str_undefined_transcode_to_utf16;
+    encoding->methods.transcode_from_utf16 =
+	str_undefined_transcode_from_utf16;
 
     switch (rb_encoding_type) {
 	case ENCODING_TYPE_SPECIAL:

Modified: MacRuby/trunk/encoding.h
===================================================================
--- MacRuby/trunk/encoding.h	2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/encoding.h	2010-05-09 05:47:14 UTC (rev 4052)
@@ -116,6 +116,8 @@
     long (*bytesize)(rb_str_t *);
     character_boundaries_t (*get_character_boundaries)(rb_str_t *, long, bool);
     long (*offset_in_bytes_to_index)(rb_str_t *, long, bool);
+    void (*transcode_to_utf16)(struct rb_encoding *, rb_str_t *, long *, UChar **, long *);
+    void (*transcode_from_utf16)(struct rb_encoding *, UChar *, long, long *, char **, long *);
 } encoding_methods_t;
 
 typedef struct rb_encoding {

Modified: MacRuby/trunk/error.c
===================================================================
--- MacRuby/trunk/error.c	2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/error.c	2010-05-09 05:47:14 UTC (rev 4052)
@@ -342,6 +342,9 @@
 VALUE rb_eNameError;
 VALUE rb_eEncodingError;
 VALUE rb_eEncCompatError;
+VALUE rb_eUndefinedConversionError;
+VALUE rb_eInvalidByteSequenceError;
+VALUE rb_eConverterNotFoundError;
 VALUE rb_eNoMethodError;
 VALUE rb_eSecurityError;
 VALUE rb_eNotImpError;
@@ -1094,6 +1097,9 @@
     rb_eNoMemError = rb_define_class("NoMemoryError", rb_eException);
     rb_eEncodingError = rb_define_class("EncodingError", rb_eStandardError);
     rb_eEncCompatError = rb_define_class_under(rb_cEncoding, "CompatibilityError", rb_eEncodingError);
+    rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
+    rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
+    rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
 
     syserr_tbl = st_init_numtable();
     GC_ROOT(&syserr_tbl);

Modified: MacRuby/trunk/include/ruby/ruby.h
===================================================================
--- MacRuby/trunk/include/ruby/ruby.h	2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/include/ruby/ruby.h	2010-05-09 05:47:14 UTC (rev 4052)
@@ -959,6 +959,9 @@
 RUBY_EXTERN VALUE rb_eRegexpError;
 RUBY_EXTERN VALUE rb_eEncodingError;
 RUBY_EXTERN VALUE rb_eEncCompatError;
+RUBY_EXTERN VALUE rb_eUndefinedConversionError;
+RUBY_EXTERN VALUE rb_eInvalidByteSequenceError;
+RUBY_EXTERN VALUE rb_eConverterNotFoundError;
 
 RUBY_EXTERN VALUE rb_eScriptError;
 RUBY_EXTERN VALUE rb_eNameError;

Modified: MacRuby/trunk/spec/frozen/tags/macruby/core/string/encode_tags.txt
===================================================================
--- MacRuby/trunk/spec/frozen/tags/macruby/core/string/encode_tags.txt	2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/spec/frozen/tags/macruby/core/string/encode_tags.txt	2010-05-09 05:47:14 UTC (rev 4052)
@@ -1,30 +1,19 @@
 fails:String#encode! transcodes to the default internal encoding with no argument
-fails:String#encode! returns self when called with only a target encoding
-fails:String#encode! tags the String with the given encoding
 fails:String#encode! transcodes self to the given encoding
 fails:String#encode! can convert between encodings where a multi-stage conversion path is needed
 fails:String#encode! raises an Encoding::InvalidByteSequenceError for invalid byte sequences
 fails:String#encode! raises UndefinedConversionError if the String contains characters invalid for the target     encoding
 fails:String#encode! raises Encoding::ConverterNotFoundError for invalid target encodings
-fails:String#encode! raises a RuntimeError when called on a frozen String
-fails:String#encode! raises a RuntimeError when called on a frozen String when it's a no-op
 fails:String#encode transcodes to the default internal encoding with no argument
-fails:String#encode returns a copy of self when called with only a target encoding
 fails:String#encode returns self when called with only a target encoding
-fails:String#encode returns a copy of self even when no changes are made
-fails:String#encode returns a String with the given encoding
 fails:String#encode transcodes self to the given encoding
 fails:String#encode can convert between encodings where a multi-stage conversion path is needed
 fails:String#encode raises an Encoding::InvalidByteSequenceError for invalid byte sequences
 fails:String#encode raises UndefinedConversionError if the String contains characters invalid for the target     encoding
 fails:String#encode raises Encoding::ConverterNotFoundError for invalid target encodings
-fails:String#encode! replaces invalid characters
 fails:String#encode! replaces undefined characters
 fails:String#encode! replaces xml characters
 fails:String#encode! replaces xml characters and quotes the result
-fails:String#encode replaces invalid characters
 fails:String#encode replaces undefined characters
 fails:String#encode replaces xml characters
 fails:String#encode replaces xml characters and quotes the result
-fails:String#encode with no arguments returns a copy of self transcoded to Encoding.default_internal
-fails:String#encode with no arguments returns a copy of self

Modified: MacRuby/trunk/string.c
===================================================================
--- MacRuby/trunk/string.c	2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/string.c	2010-05-09 05:47:14 UTC (rev 4052)
@@ -1242,6 +1242,141 @@
     }
 }
 
+enum { // how str_transcode reacts to invalid source bytes or to characters undefined in the destination
+    TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, // raise the matching Encoding error
+    TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING, // append the replacement string to the result instead
+    TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT, // undefined chars: currently skipped with no output; aborts if used for invalid bytes
+    TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR // undefined chars: currently skipped with no output; aborts if used for invalid bytes
+};
+static rb_str_t *
+str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_encoding,
+	int behavior_for_invalid, int behavior_for_undefined, rb_str_t *replacement_str)
+{
+    if ((behavior_for_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
+	   || (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)) {
+	assert(replacement_str != NULL);
+	assert(replacement_str->encoding != NULL);
+	assert(replacement_str->encoding == dst_encoding);
+    }
+    // builds and returns a new string tagged with dst_encoding; self is read as src_encoding
+    rb_str_t *dst_str = str_alloc(rb_cRubyString);
+    dst_str->encoding = dst_encoding;
+
+    if (self->length_in_bytes == 0) {
+	return dst_str;
+    }
+
+    if (src_encoding == self->encoding) {
+	// if the string can already be converted to UTF-16, half the job is done
+	str_try_making_data_uchars(self);
+    }
+    else {
+	// if the source encoding is not the string encoding
+	// we must be sure to start from the bytes, not UTF-16
+	str_make_data_binary(self);
+    }
+    // a binary source or destination is transcoded as if it were ASCII
+    rb_encoding_t *src_encoding_used;
+    rb_encoding_t *dst_encoding_used;
+    if (BINARY_ENC(dst_encoding)) {
+	dst_encoding_used = rb_encodings[ENCODING_ASCII];
+    }
+    else {
+	dst_encoding_used = dst_encoding;
+    }
+    if (BINARY_ENC(src_encoding)) {
+	src_encoding_used = rb_encodings[ENCODING_ASCII];
+    }
+    else {
+	src_encoding_used = src_encoding;
+    }
+    // each pass decodes one run of the source to UTF-16, re-encodes it, then handles the bytes that stopped it
+    long pos_in_src = 0;
+    for (;;) {
+	UChar *utf16;
+	long utf16_length;
+	// if the encoding is native UTF-16 it's always stored in UChars
+	// but it can contain invalid bytes
+	if (str_is_stored_in_uchars(self) && !NATIVE_UTF16_ENC(self->encoding)) {
+	    utf16 = self->data.uchars;
+	    utf16_length = BYTES_TO_UCHARS(self->length_in_bytes);
+	    pos_in_src = self->length_in_bytes;
+	}
+	else {
+	    src_encoding_used->methods.transcode_to_utf16(src_encoding_used,
+		    self, &pos_in_src, &utf16, &utf16_length);
+	}
+
+	if (utf16_length > 0) {
+	    long utf16_pos = 0;
+	    for (;;) {
+		long bytes_length;
+		char *bytes;
+		dst_encoding_used->methods.transcode_from_utf16(dst_encoding_used,
+			utf16, utf16_length, &utf16_pos, &bytes, &bytes_length);
+		if (bytes_length > 0) {
+		    str_concat_bytes(dst_str, bytes, bytes_length);
+		}
+		if (utf16_pos < utf16_length) {
+		    // undefined char
+		    UChar32 c;
+		    U16_NEXT(utf16, utf16_pos, utf16_length, c);
+		    switch (behavior_for_undefined) {
+			case TRANSCODE_BEHAVIOR_RAISE_EXCEPTION:
+			    rb_raise(rb_eUndefinedConversionError, "U+%04X from %s to %s", c, src_encoding->public_name, dst_encoding->public_name);
+			    break;
+			case TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING:
+			    str_concat_bytes(dst_str, replacement_str->data.bytes, replacement_str->length_in_bytes);
+			    break;
+			case TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT:
+			case TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR:
+			    break;
+			default:
+			    abort();
+		    }
+		}
+		if (utf16_pos == utf16_length) {
+		    break;
+		}
+	    }
+	}
+
+	if (pos_in_src < self->length_in_bytes) {
+	    // invalid bytes
+	    long invalid_bytes_length = src_encoding->min_char_size;
+	    if (invalid_bytes_length + pos_in_src > self->length_in_bytes) {
+		invalid_bytes_length = self->length_in_bytes - pos_in_src;
+	    }
+	    switch (behavior_for_invalid) {
+		case TRANSCODE_BEHAVIOR_RAISE_EXCEPTION:
+		    {
+			char *bytes_list = xmalloc(invalid_bytes_length * 4 + 1); // 4 chars ("\xNN") per byte + the NUL sprintf appends
+			char *bytes_list_pos = bytes_list;
+			for (long i = 0; i < invalid_bytes_length; ++i) {
+			    sprintf(bytes_list_pos, "\\x%02X", (unsigned char)self->data.bytes[pos_in_src+i]);
+			    bytes_list_pos += 4;
+			}
+			rb_raise(rb_eInvalidByteSequenceError, "\"%s\" on %s", bytes_list, src_encoding->public_name);
+		    }
+		    break;
+		case TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING:
+		    str_concat_bytes(dst_str, replacement_str->data.bytes, replacement_str->length_in_bytes);
+		    break;
+		default:
+		    abort();
+	    }
+	    pos_in_src += invalid_bytes_length;
+	}
+
+	if (pos_in_src == self->length_in_bytes) {
+	    break;
+	}
+    }
+
+    return dst_str;
+}
+
+
 //----------------------------------------------
 // Functions called by MacRuby
 
@@ -1586,6 +1721,170 @@
 
 /*
  *  call-seq:
+ *     str.encode(encoding [, options] )   => str
+ *     str.encode(dst_encoding, src_encoding [, options] )   => str
+ *     str.encode([options])   => str
+ *
+ *  The first form returns a copy of <i>str</i> transcoded
+ *  to encoding +encoding+.
+ *  The second form returns a copy of <i>str</i> transcoded
+ *  from src_encoding to dst_encoding.
+ *  The last form returns a copy of <i>str</i> transcoded to
+ *  <code>Encoding.default_internal</code>.
+ *  By default, the first and second form raise
+ *  Encoding::UndefinedConversionError for characters that are
+ *  undefined in the destination encoding, and
+ *  Encoding::InvalidByteSequenceError for invalid byte sequences
+ *  in the source encoding. The last form by default does not raise
+ *  exceptions but uses replacement strings.
+ *  The <code>options</code> Hash gives details for conversion.
+ *
+ *  === options
+ *  The hash <code>options</code> can have the following keys:
+ *  :invalid ::
+ *    If the value is <code>:replace</code>, <code>#encode</code> replaces
+ *    invalid byte sequences in <code>str</code> with the replacement character.
+ *    The default is to raise Encoding::InvalidByteSequenceError.
+ *  :undef ::
+ *    If the value is <code>:replace</code>, <code>#encode</code> replaces
+ *    characters which are undefined in the destination encoding with
+ *    the replacement character.
+ *  :replace ::
+ *    Sets the replacement string to the value. The default replacement
+ *    string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
+ *  :xml ::
+ *    The value must be <code>:text</code> or <code>:attr</code>.
+ *    If the value is <code>:text</code> <code>#encode</code> replaces
+ *    undefined characters with their (upper-case hexadecimal) numeric
+ *    character references. '&', '<', and '>' are converted to "&amp;",
+ *    "&lt;", and "&gt;", respectively.
+ *    If the value is <code>:attr</code>, <code>#encode</code> also quotes
+ *    the replacement result (using '"'), and replaces '"' with "&quot;".
+ */
+extern rb_encoding_t *default_internal;
+static VALUE
+rstr_encode(VALUE str, SEL sel, int argc, VALUE *argv)
+{
+    VALUE opt = Qnil;
+    if (argc > 0) {
+        opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
+        if (!NIL_P(opt)) {
+            argc--;
+        }
+    }
+    // a trailing Hash is the options hash and does not count as an encoding argument
+    rb_str_t *self = RSTR(str);
+    rb_str_t *replacement_str = NULL;
+    rb_encoding_t *src_encoding, *dst_encoding;
+    int behavior_for_invalid = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
+    int behavior_for_undefined = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
+    if (argc == 0) {
+	src_encoding = self->encoding;
+	dst_encoding = default_internal;
+	behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+	behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+    }
+    else if (argc == 1) {
+	src_encoding = self->encoding;
+	dst_encoding = rb_to_encoding(argv[0]);
+    }
+    else if (argc == 2) {
+	dst_encoding = rb_to_encoding(argv[0]);
+	src_encoding = rb_to_encoding(argv[1]);
+    }
+    else {
+	rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
+    }
+    // options: :invalid, :undef, :xml and :replace
+    if (!NIL_P(opt)) {
+	VALUE invalid_val = rb_hash_aref(opt, ID2SYM(rb_intern("invalid")));
+	VALUE replace_sym = ID2SYM(rb_intern("replace"));
+	if (invalid_val == replace_sym) {
+	    behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+	}
+	VALUE undefined_val = rb_hash_aref(opt, ID2SYM(rb_intern("undef"))); // the option key is :undef, not :undefined
+	if (undefined_val == replace_sym) {
+	    behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+	}
+	VALUE xml_val = rb_hash_aref(opt, ID2SYM(rb_intern("xml")));
+	if (xml_val == ID2SYM(rb_intern("text"))) {
+	    behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT;
+	}
+	else if (xml_val == ID2SYM(rb_intern("attr"))) {
+	    behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR;
+	}
+
+	VALUE replacement = rb_hash_aref(opt, replace_sym);
+	if (!NIL_P(replacement)) {
+	    replacement_str = str_need_string(replacement);
+	    if (replacement_str->encoding != dst_encoding) {
+		replacement_str = str_transcode(replacement_str, replacement_str->encoding,
+			dst_encoding, TRANSCODE_BEHAVIOR_RAISE_EXCEPTION,
+			TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, NULL);
+	    }
+	    if ((behavior_for_invalid != TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
+		    && (behavior_for_undefined == TRANSCODE_BEHAVIOR_RAISE_EXCEPTION)) {
+		behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+	    }
+	}
+    }
+    // a replacement is needed but none was given: U+FFFD for the Unicode encodings, "?" otherwise
+    if ((replacement_str == NULL)
+	    && ((behavior_for_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
+		|| (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING))) {
+	if (dst_encoding == rb_encodings[ENCODING_UTF16BE]) {
+	    replacement_str = RSTR(rb_enc_str_new("\xFF\xFD", 2, dst_encoding));
+	}
+	else if (dst_encoding == rb_encodings[ENCODING_UTF32BE]) {
+	    replacement_str = RSTR(rb_enc_str_new("\0\0\xFF\xFD", 4, dst_encoding));
+	}
+	else if (dst_encoding == rb_encodings[ENCODING_UTF16LE]) {
+	    replacement_str = RSTR(rb_enc_str_new("\xFD\xFF", 2, dst_encoding));
+	}
+	else if (dst_encoding == rb_encodings[ENCODING_UTF32LE]) {
+	    replacement_str = RSTR(rb_enc_str_new("\xFD\xFF\0\0", 4, dst_encoding));
+	}
+	else if (dst_encoding == rb_encodings[ENCODING_UTF8]) {
+	    replacement_str = RSTR(rb_enc_str_new("\xEF\xBF\xBD", 3, dst_encoding));
+	}
+	else {
+	    replacement_str = RSTR(rb_enc_str_new("?", 1, rb_encodings[ENCODING_ASCII]));
+	    replacement_str = str_transcode(replacement_str, replacement_str->encoding,
+		    dst_encoding, TRANSCODE_BEHAVIOR_RAISE_EXCEPTION,
+		    TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, NULL);
+	}
+    }
+
+    return (VALUE)str_transcode(self, src_encoding, dst_encoding,
+	    behavior_for_invalid, behavior_for_undefined, replacement_str);
+}
+
+/*
+ *  call-seq:
+ *     str.encode!(encoding [, options] )   => str
+ *     str.encode!(dst_encoding, src_encoding [, options] )   => str
+ *
+ *  The first form transcodes the contents of <i>str</i> from
+ *  str.encoding to +encoding+.
+ *  The second form transcodes the contents of <i>str</i> from
+ *  src_encoding to dst_encoding.
+ *  The options Hash gives details for conversion. See String#encode
+ *  for details.
+ *  Returns the string even if no changes were made.
+ */
+static VALUE
+rstr_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv)
+{
+    rstr_modify(str); // runs first, before any transcoding work is done
+    // transcode into a temporary string, then move its content into the receiver
+    VALUE transcoded = rstr_encode(str, sel, argc, argv);
+    str_replace_with_string(RSTR(str), RSTR(transcoded));
+    return str;
+}
+
+
+/*
+ *  call-seq:
  *     str[fixnum]                 => new_str or nil
  *     str[fixnum, fixnum]         => new_str or nil
  *     str[range]                  => new_str or nil
@@ -5533,6 +5832,8 @@
     rb_objc_define_method(rb_cRubyString, "partition", rstr_partition, 1);
     rb_objc_define_method(rb_cRubyString, "rpartition", rstr_rpartition, 1);
     rb_objc_define_method(rb_cRubyString, "crypt", rstr_crypt, 1);
+    rb_objc_define_method(rb_cRubyString, "encode", rstr_encode, -1);
+    rb_objc_define_method(rb_cRubyString, "encode!", rstr_encode_bang, -1);
 
     // MacRuby extensions.
     rb_objc_define_method(rb_cRubyString, "transform", rstr_transform, 1);

Modified: MacRuby/trunk/ucnv.c
===================================================================
--- MacRuby/trunk/ucnv.c	2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/ucnv.c	2010-05-09 05:47:14 UTC (rev 4052)
@@ -15,13 +15,13 @@
 
 // do not forget to close the converter
 // before leaving the function
-#define USE_CONVERTER(cnv, str) \
-    assert(str->encoding->private_data != NULL); \
+#define USE_CONVERTER(cnv, encoding) \
+    assert(encoding->private_data != NULL); \
     char cnv##_buffer[U_CNV_SAFECLONE_BUFFERSIZE]; \
     UErrorCode cnv##_err = U_ZERO_ERROR; \
     int32_t cnv##_buffer_size = U_CNV_SAFECLONE_BUFFERSIZE; \
     UConverter *cnv = ucnv_safeClone( \
-	    (UConverter *)str->encoding->private_data, \
+	    (UConverter *)encoding->private_data, \
 	    cnv##_buffer, \
 	    &cnv##_buffer_size, \
 	    &cnv##_err \
@@ -33,7 +33,7 @@
 {
     assert(!str_is_stored_in_uchars(self));
 
-    USE_CONVERTER(cnv, self);
+    USE_CONVERTER(cnv, self->encoding);
 
     bool ascii_only = true;
     bool valid_encoding = true;
@@ -78,7 +78,7 @@
 {
     assert(str_is_stored_in_uchars(self));
 
-    USE_CONVERTER(cnv, self);
+    USE_CONVERTER(cnv, self->encoding);
 
     UErrorCode err = U_ZERO_ERROR;
     long capa = UCNV_GET_MAX_BYTES_FOR_STRING(BYTES_TO_UCHARS(
@@ -134,7 +134,7 @@
 {
     assert(!str_is_stored_in_uchars(self));
 
-    USE_CONVERTER(cnv, self);
+    USE_CONVERTER(cnv, self->encoding);
 
     long capa = utf16_bytesize_approximation(self->encoding,
 	    self->length_in_bytes);
@@ -180,7 +180,7 @@
 {
     assert(!str_is_stored_in_uchars(self));
 
-    USE_CONVERTER(cnv, self);
+    USE_CONVERTER(cnv, self->encoding);
 
     const char *pos = self->data.bytes;
     const char *end = pos + self->length_in_bytes;
@@ -227,7 +227,7 @@
     // for strings stored in UTF-16 for which the Ruby encoding is not UTF-16,
     // we have to convert back the string in its original encoding to get the
     // length in bytes
-    USE_CONVERTER(cnv, self);
+    USE_CONVERTER(cnv, self->encoding);
 
     UErrorCode err = U_ZERO_ERROR;
 
@@ -270,7 +270,7 @@
     }
 
     // the code has many similarities with str_length
-    USE_CONVERTER(cnv, self);
+    USE_CONVERTER(cnv, self->encoding);
 
     const char *pos = self->data.bytes;
     const char *end = pos + self->length_in_bytes;
@@ -360,7 +360,7 @@
     assert(!str_is_stored_in_uchars(self));
 
     // the code has many similarities with str_length
-    USE_CONVERTER(cnv, self);
+    USE_CONVERTER(cnv, self->encoding);
 
     const char *current_position = self->data.bytes;
     const char *searched_position = current_position + offset_in_bytes;
@@ -410,6 +410,79 @@
     return index;
 }
 
+static void
+str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc,
+	rb_str_t *self, long *pos,
+	UChar **utf16, long *utf16_length)
+{
+    USE_CONVERTER(cnv, src_enc);
+    // decodes self's bytes from offset *pos onwards into a newly allocated UTF-16 buffer
+    long capa = utf16_bytesize_approximation(src_enc,
+	    self->length_in_bytes); // buffer capacity, in bytes
+    const char *source_pos = self->data.bytes + *pos;
+    const char *source_end = self->data.bytes + self->length_in_bytes;
+    UChar *buffer = xmalloc(capa);
+    UChar *target_pos = buffer;
+    UErrorCode err = U_ZERO_ERROR;
+    for (;;) { // retry with a larger buffer for as long as ICU reports an overflow
+	UChar *target_end = buffer + BYTES_TO_UCHARS(capa);
+	err = U_ZERO_ERROR; // reset before each attempt
+	ucnv_toUnicode(cnv, &target_pos, target_end, &source_pos, source_end,
+		NULL, true, &err);
+	if (err == U_BUFFER_OVERFLOW_ERROR) {
+	    long index = target_pos - buffer; // keep the write offset: xrealloc may move the buffer
+	    capa *= 2; // double the buffer's size
+	    buffer = xrealloc(buffer, capa);
+	    target_pos = buffer + index;
+	}
+	else {
+	    break;
+	}
+    }
+
+    ucnv_close(cnv);
+    // outputs: the buffer, the number of UChars written, and the byte offset where decoding stopped
+    *utf16 = buffer;
+    *utf16_length = target_pos - buffer;
+    *pos = source_pos - self->data.bytes;
+
+    if (U_FAILURE(err)) {
+	// the invalid character will be skipped by str_transcode
+	*pos -= src_enc->min_char_size; // NOTE(review): assumes ICU stopped just past the offending bytes - confirm
+    }
+}
+
+static void
+str_ucnv_transcode_from_utf16(struct rb_encoding *dst_enc,
+	UChar *utf16, long utf16_length, long *utf16_pos,
+	char **bytes, long *bytes_length)
+{
+    USE_CONVERTER(cnv, dst_enc);
+    // encodes utf16[*utf16_pos .. utf16_length) into a newly allocated byte buffer
+    UErrorCode err = U_ZERO_ERROR;
+    long capa = UCNV_GET_MAX_BYTES_FOR_STRING(
+	    utf16_length - *utf16_pos, ucnv_getMaxCharSize(cnv)); // sized up front; an overflow is asserted against below
+    char *buffer = xmalloc(capa);
+    const UChar *source_pos = &utf16[*utf16_pos];
+    const UChar *source_end = &utf16[utf16_length];
+    char *target_pos = buffer;
+    char *target_end = buffer + capa;
+    ucnv_fromUnicode(cnv, &target_pos, target_end, &source_pos, source_end,
+	    NULL, true, &err);
+    assert((err != U_ILLEGAL_ARGUMENT_ERROR) && (err != U_BUFFER_OVERFLOW_ERROR));
+
+    ucnv_close(cnv);
+    // outputs: the buffer, the number of bytes written, and the UTF-16 index where encoding stopped
+    *bytes = buffer;
+    *bytes_length = target_pos - buffer;
+    *utf16_pos = source_pos - utf16;
+
+    if (U_FAILURE(err)) {
+	// the undefined character will be skipped by str_transcode
+	U16_BACK_1(utf16, 0, *utf16_pos); // step back one code point so *utf16_pos designates the failing character
+    }
+}
+
 void
 enc_init_ucnv_encoding(rb_encoding_t *encoding)
 {
@@ -440,4 +513,8 @@
 	str_ucnv_get_character_boundaries;
     encoding->methods.offset_in_bytes_to_index =
 	str_ucnv_offset_in_bytes_to_index;
+    encoding->methods.transcode_to_utf16 =
+	str_ucnv_transcode_to_utf16;
+    encoding->methods.transcode_from_utf16 =
+	str_ucnv_transcode_from_utf16;
 }
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20100508/08afe949/attachment-0001.html>


More information about the macruby-changes mailing list