[macruby-changes] [4052] MacRuby/trunk
source_changes at macosforge.org
source_changes at macosforge.org
Sat May 8 22:47:19 PDT 2010
Revision: 4052
http://trac.macosforge.org/projects/ruby/changeset/4052
Author: vincent.isambart at gmail.com
Date: 2010-05-08 22:47:14 -0700 (Sat, 08 May 2010)
Log Message:
-----------
an (incomplete) implementation of String#encode
Modified Paths:
--------------
MacRuby/trunk/encoding.c
MacRuby/trunk/encoding.h
MacRuby/trunk/error.c
MacRuby/trunk/include/ruby/ruby.h
MacRuby/trunk/spec/frozen/tags/macruby/core/string/encode_tags.txt
MacRuby/trunk/string.c
MacRuby/trunk/ucnv.c
Modified: MacRuby/trunk/encoding.c
===================================================================
--- MacRuby/trunk/encoding.c 2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/encoding.c 2010-05-09 05:47:14 UTC (rev 4052)
@@ -17,7 +17,7 @@
VALUE rb_cEncoding;
-static rb_encoding_t *default_internal = NULL;
+rb_encoding_t *default_internal = NULL;
static rb_encoding_t *default_external = NULL;
rb_encoding_t *rb_encodings[ENCODINGS_COUNT];
@@ -28,6 +28,8 @@
static long str_undefined_bytesize(rb_str_t *self) { abort(); }
static character_boundaries_t str_undefined_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode) { abort(); }
static long str_undefined_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes, bool ucs2_mode) { abort(); }
+// Placeholder transcoding methods: both simply abort(). This same change
+// assigns them to encoding->methods.transcode_to_utf16 and
+// encoding->methods.transcode_from_utf16 just before the switch on the
+// encoding type, alongside the other str_undefined_* placeholders.
+static void str_undefined_transcode_to_utf16(struct rb_encoding *src_enc, rb_str_t *self, long *pos, UChar **utf16, long *utf16_length) { abort(); }
+static void str_undefined_transcode_from_utf16(struct rb_encoding *dst_enc, UChar *utf16, long utf16_length, long *pos, char **bytes, long *bytes_length) { abort(); }
static VALUE
mr_enc_s_list(VALUE klass, SEL sel)
@@ -235,6 +237,10 @@
str_undefined_get_character_boundaries;
encoding->methods.offset_in_bytes_to_index =
str_undefined_offset_in_bytes_to_index;
+ encoding->methods.transcode_to_utf16 =
+ str_undefined_transcode_to_utf16;
+ encoding->methods.transcode_from_utf16 =
+ str_undefined_transcode_from_utf16;
switch (rb_encoding_type) {
case ENCODING_TYPE_SPECIAL:
Modified: MacRuby/trunk/encoding.h
===================================================================
--- MacRuby/trunk/encoding.h 2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/encoding.h 2010-05-09 05:47:14 UTC (rev 4052)
@@ -116,6 +116,8 @@
long (*bytesize)(rb_str_t *);
character_boundaries_t (*get_character_boundaries)(rb_str_t *, long, bool);
long (*offset_in_bytes_to_index)(rb_str_t *, long, bool);
+ void (*transcode_to_utf16)(struct rb_encoding *, rb_str_t *, long *, UChar **, long *);
+ void (*transcode_from_utf16)(struct rb_encoding *, UChar *, long, long *, char **, long *);
} encoding_methods_t;
typedef struct rb_encoding {
Modified: MacRuby/trunk/error.c
===================================================================
--- MacRuby/trunk/error.c 2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/error.c 2010-05-09 05:47:14 UTC (rev 4052)
@@ -342,6 +342,9 @@
VALUE rb_eNameError;
VALUE rb_eEncodingError;
VALUE rb_eEncCompatError;
+VALUE rb_eUndefinedConversionError;
+VALUE rb_eInvalidByteSequenceError;
+VALUE rb_eConverterNotFoundError;
VALUE rb_eNoMethodError;
VALUE rb_eSecurityError;
VALUE rb_eNotImpError;
@@ -1094,6 +1097,9 @@
rb_eNoMemError = rb_define_class("NoMemoryError", rb_eException);
rb_eEncodingError = rb_define_class("EncodingError", rb_eStandardError);
rb_eEncCompatError = rb_define_class_under(rb_cEncoding, "CompatibilityError", rb_eEncodingError);
+ rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
+ rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
+ rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
syserr_tbl = st_init_numtable();
GC_ROOT(&syserr_tbl);
Modified: MacRuby/trunk/include/ruby/ruby.h
===================================================================
--- MacRuby/trunk/include/ruby/ruby.h 2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/include/ruby/ruby.h 2010-05-09 05:47:14 UTC (rev 4052)
@@ -959,6 +959,9 @@
RUBY_EXTERN VALUE rb_eRegexpError;
RUBY_EXTERN VALUE rb_eEncodingError;
RUBY_EXTERN VALUE rb_eEncCompatError;
+RUBY_EXTERN VALUE rb_eUndefinedConversionError;
+RUBY_EXTERN VALUE rb_eInvalidByteSequenceError;
+RUBY_EXTERN VALUE rb_eConverterNotFoundError;
RUBY_EXTERN VALUE rb_eScriptError;
RUBY_EXTERN VALUE rb_eNameError;
Modified: MacRuby/trunk/spec/frozen/tags/macruby/core/string/encode_tags.txt
===================================================================
--- MacRuby/trunk/spec/frozen/tags/macruby/core/string/encode_tags.txt 2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/spec/frozen/tags/macruby/core/string/encode_tags.txt 2010-05-09 05:47:14 UTC (rev 4052)
@@ -1,30 +1,19 @@
fails:String#encode! transcodes to the default internal encoding with no argument
-fails:String#encode! returns self when called with only a target encoding
-fails:String#encode! tags the String with the given encoding
fails:String#encode! transcodes self to the given encoding
fails:String#encode! can convert between encodings where a multi-stage conversion path is needed
fails:String#encode! raises an Encoding::InvalidByteSequenceError for invalid byte sequences
fails:String#encode! raises UndefinedConversionError if the String contains characters invalid for the target encoding
fails:String#encode! raises Encoding::ConverterNotFoundError for invalid target encodings
-fails:String#encode! raises a RuntimeError when called on a frozen String
-fails:String#encode! raises a RuntimeError when called on a frozen String when it's a no-op
fails:String#encode transcodes to the default internal encoding with no argument
-fails:String#encode returns a copy of self when called with only a target encoding
fails:String#encode returns self when called with only a target encoding
-fails:String#encode returns a copy of self even when no changes are made
-fails:String#encode returns a String with the given encoding
fails:String#encode transcodes self to the given encoding
fails:String#encode can convert between encodings where a multi-stage conversion path is needed
fails:String#encode raises an Encoding::InvalidByteSequenceError for invalid byte sequences
fails:String#encode raises UndefinedConversionError if the String contains characters invalid for the target encoding
fails:String#encode raises Encoding::ConverterNotFoundError for invalid target encodings
-fails:String#encode! replaces invalid characters
fails:String#encode! replaces undefined characters
fails:String#encode! replaces xml characters
fails:String#encode! replaces xml characters and quotes the result
-fails:String#encode replaces invalid characters
fails:String#encode replaces undefined characters
fails:String#encode replaces xml characters
fails:String#encode replaces xml characters and quotes the result
-fails:String#encode with no arguments returns a copy of self transcoded to Encoding.default_internal
-fails:String#encode with no arguments returns a copy of self
Modified: MacRuby/trunk/string.c
===================================================================
--- MacRuby/trunk/string.c 2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/string.c 2010-05-09 05:47:14 UTC (rev 4052)
@@ -1242,6 +1242,141 @@
}
}
+// What str_transcode does when it meets bytes that are invalid in the source
+// encoding, or a character that is undefined in the destination encoding.
+enum {
+ // raise Encoding::InvalidByteSequenceError / UndefinedConversionError
+ TRANSCODE_BEHAVIOR_RAISE_EXCEPTION,
+ // append the bytes of the replacement string instead
+ TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING,
+ // selected by :xml => :text; str_transcode currently appends nothing for
+ // an undefined character in this mode
+ TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT,
+ // selected by :xml => :attr; handled like the XML text mode for now
+ TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR
+};
+// Transcode the contents of `self`, read as src_encoding, into a newly
+// allocated string tagged with dst_encoding.
+//
+// Each pass converts source bytes to UTF-16 with the source encoding's
+// transcode_to_utf16 method, then converts that UTF-16 to destination bytes
+// with the destination encoding's transcode_from_utf16 method. A binary
+// source or destination encoding is handled with the ASCII encoding's
+// methods. Before converting, str_try_making_data_uchars or
+// str_make_data_binary is called on `self`.
+//
+// behavior_for_invalid   - TRANSCODE_BEHAVIOR_* value applied to source
+//                          bytes that could not be converted to UTF-16.
+// behavior_for_undefined - TRANSCODE_BEHAVIOR_* value applied to characters
+//                          that could not be converted to dst_encoding.
+// replacement_str        - bytes appended by REPLACE_WITH_STRING; must be
+//                          non-NULL and in dst_encoding when either behavior
+//                          uses it (asserted).
+//
+// Raises Encoding::UndefinedConversionError or
+// Encoding::InvalidByteSequenceError when the matching behavior is
+// TRANSCODE_BEHAVIOR_RAISE_EXCEPTION.
+static rb_str_t *
+str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_encoding,
+        int behavior_for_invalid, int behavior_for_undefined, rb_str_t *replacement_str)
+{
+    if ((behavior_for_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
+            || (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)) {
+        assert(replacement_str != NULL);
+        assert(replacement_str->encoding != NULL);
+        assert(replacement_str->encoding == dst_encoding);
+    }
+
+    rb_str_t *dst_str = str_alloc(rb_cRubyString);
+    dst_str->encoding = dst_encoding;
+
+    if (self->length_in_bytes == 0) {
+        return dst_str;
+    }
+
+    if (src_encoding == self->encoding) {
+        // if the string can already be converted in UTF-16, half the job is done
+        str_try_making_data_uchars(self);
+    }
+    else {
+        // if the source encoding is not the string encoding
+        // we must be sure to start from the bytes, not UTF-16
+        str_make_data_binary(self);
+    }
+
+    // binary data is transcoded with the ASCII encoding's methods
+    rb_encoding_t *src_encoding_used;
+    rb_encoding_t *dst_encoding_used;
+    if (BINARY_ENC(dst_encoding)) {
+        dst_encoding_used = rb_encodings[ENCODING_ASCII];
+    }
+    else {
+        dst_encoding_used = dst_encoding;
+    }
+    if (BINARY_ENC(src_encoding)) {
+        src_encoding_used = rb_encodings[ENCODING_ASCII];
+    }
+    else {
+        src_encoding_used = src_encoding;
+    }
+
+    long pos_in_src = 0;
+    for (;;) {
+        UChar *utf16;
+        long utf16_length;
+        // if the encoding is native UTF-16 it's always stored in UChars
+        // but it can contain invalid bytes
+        if (str_is_stored_in_uchars(self) && !NATIVE_UTF16_ENC(self->encoding)) {
+            utf16 = self->data.uchars;
+            utf16_length = BYTES_TO_UCHARS(self->length_in_bytes);
+            pos_in_src = self->length_in_bytes;
+        }
+        else {
+            src_encoding_used->methods.transcode_to_utf16(src_encoding_used,
+                    self, &pos_in_src, &utf16, &utf16_length);
+        }
+
+        if (utf16_length > 0) {
+            long utf16_pos = 0;
+            for (;;) {
+                long bytes_length;
+                char *bytes;
+                dst_encoding_used->methods.transcode_from_utf16(dst_encoding_used,
+                        utf16, utf16_length, &utf16_pos, &bytes, &bytes_length);
+                if (bytes_length > 0) {
+                    str_concat_bytes(dst_str, bytes, bytes_length);
+                }
+                if (utf16_pos < utf16_length) {
+                    // undefined char
+                    UChar32 c;
+                    U16_NEXT(utf16, utf16_pos, utf16_length, c);
+                    switch (behavior_for_undefined) {
+                        case TRANSCODE_BEHAVIOR_RAISE_EXCEPTION:
+                            rb_raise(rb_eUndefinedConversionError, "U+%04X from %s to %s", c, src_encoding->public_name, dst_encoding->public_name);
+                            break;
+                        case TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING:
+                            str_concat_bytes(dst_str, replacement_str->data.bytes, replacement_str->length_in_bytes);
+                            break;
+                        case TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT:
+                        case TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR:
+                            // not implemented yet: the character is dropped
+                            break;
+                        default:
+                            abort();
+                    }
+                }
+                if (utf16_pos == utf16_length) {
+                    break;
+                }
+            }
+        }
+
+        if (pos_in_src < self->length_in_bytes) {
+            // invalid bytes
+            long invalid_bytes_length = src_encoding->min_char_size;
+            if (invalid_bytes_length + pos_in_src > self->length_in_bytes) {
+                invalid_bytes_length = self->length_in_bytes - pos_in_src;
+            }
+            switch (behavior_for_invalid) {
+                case TRANSCODE_BEHAVIOR_RAISE_EXCEPTION:
+                    {
+                        // 4 characters ("\xHH") per byte, plus one for the
+                        // terminating NUL written after the last group
+                        char *bytes_list = xmalloc(invalid_bytes_length * 4 + 1);
+                        char *bytes_list_pos = bytes_list;
+                        for (long i = 0; i < invalid_bytes_length; ++i) {
+                            snprintf(bytes_list_pos, 5, "\\x%02X", (unsigned char)self->data.bytes[pos_in_src+i]);
+                            bytes_list_pos += 4;
+                        }
+                        rb_raise(rb_eInvalidByteSequenceError, "\"%s\" on %s", bytes_list, src_encoding->public_name);
+                    }
+                    break;
+                case TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING:
+                    str_concat_bytes(dst_str, replacement_str->data.bytes, replacement_str->length_in_bytes);
+                    break;
+                default:
+                    abort();
+            }
+            pos_in_src += invalid_bytes_length;
+        }
+
+        if (pos_in_src == self->length_in_bytes) {
+            break;
+        }
+    }
+
+    return dst_str;
+}
+
+
+
+
//----------------------------------------------
// Functions called by MacRuby
@@ -1586,6 +1721,170 @@
/*
* call-seq:
+ * str.encode(encoding [, options] ) => str
+ * str.encode(dst_encoding, src_encoding [, options] ) => str
+ * str.encode([options]) => str
+ *
+ * The first form returns a copy of <i>str</i> transcoded
+ * to encoding +encoding+.
+ * The second form returns a copy of <i>str</i> transcoded
+ * from src_encoding to dst_encoding.
+ * The last form returns a copy of <i>str</i> transcoded to
+ * <code>Encoding.default_internal</code>.
+ * By default, the first and second form raise
+ * Encoding::UndefinedConversionError for characters that are
+ * undefined in the destination encoding, and
+ * Encoding::InvalidByteSequenceError for invalid byte sequences
+ * in the source encoding. The last form by default does not raise
+ * exceptions but uses replacement strings.
+ * The <code>options</code> Hash gives details for conversion.
+ *
+ * === options
+ * The hash <code>options</code> can have the following keys:
+ * :invalid ::
+ * If the value is <code>:replace</code>, <code>#encode</code> replaces
+ * invalid byte sequences in <code>str</code> with the replacement character.
+ * The default is to raise the exception
+ * :undef ::
+ * If the value is <code>:replace</code>, <code>#encode</code> replaces
+ * characters which are undefined in the destination encoding with
+ * the replacement character.
+ * :replace ::
+ * Sets the replacement string to the value. The default replacement
+ * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
+ * :xml ::
+ * The value must be <code>:text</code> or <code>:attr</code>.
+ * If the value is <code>:text</code> <code>#encode</code> replaces
+ * undefined characters with their (upper-case hexadecimal) numeric
+ * character references. '&', '<', and '>' are converted to "&amp;",
+ * "&lt;", and "&gt;", respectively.
+ * If the value is <code>:attr</code>, <code>#encode</code> also quotes
+ * the replacement result (using '"'), and replaces '"' with "&quot;".
+ */
+extern rb_encoding_t *default_internal;
+// Implementation of String#encode: parses the arguments and the optional
+// trailing options Hash, picks the replacement string when one is needed,
+// and returns the result of str_transcode (a new string).
+static VALUE
+rstr_encode(VALUE str, SEL sel, int argc, VALUE *argv)
+{
+    // a trailing Hash (or object converted by to_hash) is the options
+    // argument and does not count as an encoding argument
+    VALUE opt = Qnil;
+    if (argc > 0) {
+        opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
+        if (!NIL_P(opt)) {
+            argc--;
+        }
+    }
+
+    rb_str_t *self = RSTR(str);
+    rb_str_t *replacement_str = NULL;
+    rb_encoding_t *src_encoding, *dst_encoding;
+    int behavior_for_invalid = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
+    int behavior_for_undefined = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
+    if (argc == 0) {
+        // NOTE(review): default_internal is used as is here; confirm it can
+        // never be NULL at this point
+        src_encoding = self->encoding;
+        dst_encoding = default_internal;
+        behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+        behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+    }
+    else if (argc == 1) {
+        src_encoding = self->encoding;
+        dst_encoding = rb_to_encoding(argv[0]);
+    }
+    else if (argc == 2) {
+        dst_encoding = rb_to_encoding(argv[0]);
+        src_encoding = rb_to_encoding(argv[1]);
+    }
+    else {
+        rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
+    }
+
+    if (!NIL_P(opt)) {
+        VALUE invalid_val = rb_hash_aref(opt, ID2SYM(rb_intern("invalid")));
+        VALUE replace_sym = ID2SYM(rb_intern("replace"));
+        if (invalid_val == replace_sym) {
+            behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+        }
+        // the option key is :undef (not :undefined)
+        VALUE undefined_val = rb_hash_aref(opt, ID2SYM(rb_intern("undef")));
+        if (undefined_val == replace_sym) {
+            behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+        }
+        // :xml takes precedence over :undef => :replace
+        VALUE xml_val = rb_hash_aref(opt, ID2SYM(rb_intern("xml")));
+        if (xml_val == ID2SYM(rb_intern("text"))) {
+            behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT;
+        }
+        else if (xml_val == ID2SYM(rb_intern("attr"))) {
+            behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR;
+        }
+
+        VALUE replacement = rb_hash_aref(opt, replace_sym);
+        if (!NIL_P(replacement)) {
+            replacement_str = str_need_string(replacement);
+            // str_transcode requires the replacement to be in dst_encoding
+            if (replacement_str->encoding != dst_encoding) {
+                replacement_str = str_transcode(replacement_str, replacement_str->encoding,
+                        dst_encoding, TRANSCODE_BEHAVIOR_RAISE_EXCEPTION,
+                        TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, NULL);
+            }
+            // a :replace string given without asking for any replacement
+            // turns on replacement of undefined characters
+            if ((behavior_for_invalid != TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
+                    && (behavior_for_undefined == TRANSCODE_BEHAVIOR_RAISE_EXCEPTION)) {
+                behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+            }
+        }
+    }
+
+    // default replacement string: U+FFFD for the Unicode encodings,
+    // "?" transcoded to the destination encoding otherwise
+    if ((replacement_str == NULL)
+            && ((behavior_for_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
+                || (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING))) {
+        if (dst_encoding == rb_encodings[ENCODING_UTF16BE]) {
+            replacement_str = RSTR(rb_enc_str_new("\xFF\xFD", 2, dst_encoding));
+        }
+        else if (dst_encoding == rb_encodings[ENCODING_UTF32BE]) {
+            replacement_str = RSTR(rb_enc_str_new("\0\0\xFF\xFD", 4, dst_encoding));
+        }
+        else if (dst_encoding == rb_encodings[ENCODING_UTF16LE]) {
+            replacement_str = RSTR(rb_enc_str_new("\xFD\xFF", 2, dst_encoding));
+        }
+        else if (dst_encoding == rb_encodings[ENCODING_UTF32LE]) {
+            replacement_str = RSTR(rb_enc_str_new("\xFD\xFF\0\0", 4, dst_encoding));
+        }
+        else if (dst_encoding == rb_encodings[ENCODING_UTF8]) {
+            replacement_str = RSTR(rb_enc_str_new("\xEF\xBF\xBD", 3, dst_encoding));
+        }
+        else {
+            replacement_str = RSTR(rb_enc_str_new("?", 1, rb_encodings[ENCODING_ASCII]));
+            replacement_str = str_transcode(replacement_str, replacement_str->encoding,
+                    dst_encoding, TRANSCODE_BEHAVIOR_RAISE_EXCEPTION,
+                    TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, NULL);
+        }
+    }
+
+    return (VALUE)str_transcode(self, src_encoding, dst_encoding,
+            behavior_for_invalid, behavior_for_undefined, replacement_str);
+}
+
+/*
+ *  call-seq:
+ *     str.encode!(encoding [, options] )   => str
+ *     str.encode!(dst_encoding, src_encoding [, options] )   => str
+ *
+ *  In-place variant of String#encode. With one encoding argument the
+ *  contents of <i>str</i> are transcoded from str.encoding to +encoding+;
+ *  with two, from src_encoding to dst_encoding.
+ *  The options Hash is interpreted exactly as in String#encode.
+ *  The receiver itself is returned, whether or not anything changed.
+ */
+static VALUE
+rstr_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv)
+{
+    // checked before any work is done, so it also applies when the
+    // transcoding turns out to be a no-op
+    rstr_modify(str);
+
+    // transcode into a temporary string, then move its contents into str
+    str_replace_with_string(RSTR(str), RSTR(rstr_encode(str, sel, argc, argv)));
+    return str;
+}
+
+
+/*
+ * call-seq:
* str[fixnum] => new_str or nil
* str[fixnum, fixnum] => new_str or nil
* str[range] => new_str or nil
@@ -5533,6 +5832,8 @@
rb_objc_define_method(rb_cRubyString, "partition", rstr_partition, 1);
rb_objc_define_method(rb_cRubyString, "rpartition", rstr_rpartition, 1);
rb_objc_define_method(rb_cRubyString, "crypt", rstr_crypt, 1);
+ rb_objc_define_method(rb_cRubyString, "encode", rstr_encode, -1);
+ rb_objc_define_method(rb_cRubyString, "encode!", rstr_encode_bang, -1);
// MacRuby extensions.
rb_objc_define_method(rb_cRubyString, "transform", rstr_transform, 1);
Modified: MacRuby/trunk/ucnv.c
===================================================================
--- MacRuby/trunk/ucnv.c 2010-05-08 03:53:04 UTC (rev 4051)
+++ MacRuby/trunk/ucnv.c 2010-05-09 05:47:14 UTC (rev 4052)
@@ -15,13 +15,13 @@
// do not forget to close the converter
// before leaving the function
-#define USE_CONVERTER(cnv, str) \
- assert(str->encoding->private_data != NULL); \
+#define USE_CONVERTER(cnv, encoding) \
+ assert(encoding->private_data != NULL); \
char cnv##_buffer[U_CNV_SAFECLONE_BUFFERSIZE]; \
UErrorCode cnv##_err = U_ZERO_ERROR; \
int32_t cnv##_buffer_size = U_CNV_SAFECLONE_BUFFERSIZE; \
UConverter *cnv = ucnv_safeClone( \
- (UConverter *)str->encoding->private_data, \
+ (UConverter *)encoding->private_data, \
cnv##_buffer, \
&cnv##_buffer_size, \
&cnv##_err \
@@ -33,7 +33,7 @@
{
assert(!str_is_stored_in_uchars(self));
- USE_CONVERTER(cnv, self);
+ USE_CONVERTER(cnv, self->encoding);
bool ascii_only = true;
bool valid_encoding = true;
@@ -78,7 +78,7 @@
{
assert(str_is_stored_in_uchars(self));
- USE_CONVERTER(cnv, self);
+ USE_CONVERTER(cnv, self->encoding);
UErrorCode err = U_ZERO_ERROR;
long capa = UCNV_GET_MAX_BYTES_FOR_STRING(BYTES_TO_UCHARS(
@@ -134,7 +134,7 @@
{
assert(!str_is_stored_in_uchars(self));
- USE_CONVERTER(cnv, self);
+ USE_CONVERTER(cnv, self->encoding);
long capa = utf16_bytesize_approximation(self->encoding,
self->length_in_bytes);
@@ -180,7 +180,7 @@
{
assert(!str_is_stored_in_uchars(self));
- USE_CONVERTER(cnv, self);
+ USE_CONVERTER(cnv, self->encoding);
const char *pos = self->data.bytes;
const char *end = pos + self->length_in_bytes;
@@ -227,7 +227,7 @@
// for strings stored in UTF-16 for which the Ruby encoding is not UTF-16,
// we have to convert back the string in its original encoding to get the
// length in bytes
- USE_CONVERTER(cnv, self);
+ USE_CONVERTER(cnv, self->encoding);
UErrorCode err = U_ZERO_ERROR;
@@ -270,7 +270,7 @@
}
// the code has many similarities with str_length
- USE_CONVERTER(cnv, self);
+ USE_CONVERTER(cnv, self->encoding);
const char *pos = self->data.bytes;
const char *end = pos + self->length_in_bytes;
@@ -360,7 +360,7 @@
assert(!str_is_stored_in_uchars(self));
// the code has many similarities with str_length
- USE_CONVERTER(cnv, self);
+ USE_CONVERTER(cnv, self->encoding);
const char *current_position = self->data.bytes;
const char *searched_position = current_position + offset_in_bytes;
@@ -410,6 +410,79 @@
return index;
}
+// transcode_to_utf16 method for the encodings set up by
+// enc_init_ucnv_encoding.
+//
+// Converts the bytes of `self` from offset *pos onwards to UTF-16 with a
+// clone of src_enc's converter. On return *utf16 / *utf16_length describe
+// the newly allocated output (length in UChars) and *pos is the byte offset
+// at which the conversion stopped.
+static void
+str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc,
+        rb_str_t *self, long *pos,
+        UChar **utf16, long *utf16_length)
+{
+    USE_CONVERTER(cnv, src_enc);
+
+    long buffer_bytes = utf16_bytesize_approximation(src_enc,
+            self->length_in_bytes);
+    const char *in = self->data.bytes + *pos;
+    const char *in_end = self->data.bytes + self->length_in_bytes;
+    UChar *out_start = xmalloc(buffer_bytes);
+    UChar *out = out_start;
+    UErrorCode status = U_ZERO_ERROR;
+    do {
+        status = U_ZERO_ERROR;
+        ucnv_toUnicode(cnv, &out, out_start + BYTES_TO_UCHARS(buffer_bytes),
+                &in, in_end, NULL, true, &status);
+        if (status == U_BUFFER_OVERFLOW_ERROR) {
+            // output buffer full: double it and carry on where we stopped
+            const long written = out - out_start;
+            buffer_bytes *= 2;
+            out_start = xrealloc(out_start, buffer_bytes);
+            out = out_start + written;
+        }
+    } while (status == U_BUFFER_OVERFLOW_ERROR);
+
+    ucnv_close(cnv);
+
+    long stopped_at = in - self->data.bytes;
+    if (U_FAILURE(status)) {
+        // step back so that str_transcode sees the invalid character
+        // and skips it itself
+        stopped_at -= src_enc->min_char_size;
+    }
+
+    *utf16 = out_start;
+    *utf16_length = out - out_start;
+    *pos = stopped_at;
+}
+
+// transcode_from_utf16 method for the encodings set up by
+// enc_init_ucnv_encoding.
+//
+// Converts utf16[*utf16_pos .. utf16_length) to dst_enc with a clone of its
+// converter. On return *bytes / *bytes_length describe the newly allocated
+// output and *utf16_pos is the index at which the conversion stopped.
+static void
+str_ucnv_transcode_from_utf16(struct rb_encoding *dst_enc,
+        UChar *utf16, long utf16_length, long *utf16_pos,
+        char **bytes, long *bytes_length)
+{
+    USE_CONVERTER(cnv, dst_enc);
+
+    UErrorCode err = U_ZERO_ERROR;
+    // the buffer is sized for the worst case up front; the assert below
+    // relies on it never overflowing
+    long out_capacity = UCNV_GET_MAX_BYTES_FOR_STRING(
+            utf16_length - *utf16_pos, ucnv_getMaxCharSize(cnv));
+    char *out_start = xmalloc(out_capacity);
+    const UChar *in = utf16 + *utf16_pos;
+    const UChar *in_end = utf16 + utf16_length;
+    char *out = out_start;
+    ucnv_fromUnicode(cnv, &out, out_start + out_capacity, &in, in_end,
+            NULL, true, &err);
+    assert((err != U_ILLEGAL_ARGUMENT_ERROR) && (err != U_BUFFER_OVERFLOW_ERROR));
+
+    ucnv_close(cnv);
+
+    long stopped_at = in - utf16;
+    if (U_FAILURE(err)) {
+        // step back one character so that str_transcode sees the undefined
+        // character and skips it itself
+        U16_BACK_1(utf16, 0, stopped_at);
+    }
+
+    *bytes = out_start;
+    *bytes_length = out - out_start;
+    *utf16_pos = stopped_at;
+}
+
+
void
enc_init_ucnv_encoding(rb_encoding_t *encoding)
{
@@ -440,4 +513,8 @@
str_ucnv_get_character_boundaries;
encoding->methods.offset_in_bytes_to_index =
str_ucnv_offset_in_bytes_to_index;
+ encoding->methods.transcode_to_utf16 =
+ str_ucnv_transcode_to_utf16;
+ encoding->methods.transcode_from_utf16 =
+ str_ucnv_transcode_from_utf16;
}
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20100508/08afe949/attachment-0001.html>
More information about the macruby-changes
mailing list