Revision: 4182 http://trac.macosforge.org/projects/ruby/changeset/4182 Author: pthomson@apple.com Date: 2010-06-01 15:37:45 -0700 (Tue, 01 Jun 2010) Log Message: ----------- Add support for Encoding::Converter and move String#encode and String#encode! into the corresponding file. Modified Paths: -------------- MacRuby/trunk/encoding.c MacRuby/trunk/encoding.h MacRuby/trunk/inits.c MacRuby/trunk/rakelib/builder/builder.rb MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/asciicompat_encoding_tags.txt MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/convert_tags.txt MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/convpath_tags.txt MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/destination_encoding_tags.txt MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/replacement_tags.txt MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/search_convpath_tags.txt MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/source_encoding_tags.txt MacRuby/trunk/string.c Added Paths: ----------- MacRuby/trunk/transcode.c Removed Paths: ------------- MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/constants_tags.txt Modified: MacRuby/trunk/encoding.c =================================================================== --- MacRuby/trunk/encoding.c 2010-06-01 09:27:08 UTC (rev 4181) +++ MacRuby/trunk/encoding.c 2010-06-01 22:37:45 UTC (rev 4182) @@ -146,6 +146,32 @@ return Qfalse; } +// For UTF-[8, 16, 32] it's /uFFFD, and for others it's '?' +rb_str_t *replacement_string_for_encoding(rb_encoding_t* destination) +{ + rb_str_t *replacement_str = NULL; + if (destination == rb_encodings[ENCODING_UTF16BE]) { + replacement_str = RSTR(rb_enc_str_new("\xFF\xFD", 2, destination)); + } + else if (destination == rb_encodings[ENCODING_UTF32BE]) { + replacement_str = RSTR(rb_enc_str_new("\0\0\xFF\xFD", 4, destination)); + } + else if (destination == rb_encodings[ENCODING_UTF16LE]) { + replacement_str = RSTR(rb_enc_str_new("\xFD\xFF", 2, destination)); + } + else if (destination == rb_encodings[ENCODING_UTF32LE]) { + replacement_str = RSTR(rb_enc_str_new("\xFD\xFF\0\0", 4, destination)); + } + else if (destination == rb_encodings[ENCODING_UTF8]) { + replacement_str = RSTR(rb_enc_str_new("\xEF\xBF\xBD", 3, destination)); + } + else { + replacement_str = RSTR(rb_enc_str_new("?", 1, rb_encodings[ENCODING_ASCII])); + replacement_str = str_simple_transcode(replacement_str, destination); + } + return replacement_str; +} + static void define_encoding_constant(const char *name, rb_encoding_t *encoding) { @@ -291,6 +317,7 @@ add_encoding(ENCODING_BIG5, ENCODING_TYPE_UCNV, "Big5", 1, false, true, "CP950", NULL); // FIXME: the ICU conversion tables do not seem to match Ruby's Japanese conversion tables add_encoding(ENCODING_EUCJP, ENCODING_TYPE_UCNV, "EUC-JP", 1, false, true, "eucJP", NULL); + add_encoding(ENCODING_SJIS, ENCODING_TYPE_UCNV, "Shift_JIS", 1, false, true, "SJIS", NULL); //add_encoding(ENCODING_EUCJP, ENCODING_TYPE_RUBY, "EUC-JP", 1, false, true, "eucJP", NULL); //add_encoding(ENCODING_SJIS, ENCODING_TYPE_RUBY, "Shift_JIS", 1, false, true, "SJIS", NULL); //add_encoding(ENCODING_CP932, ENCODING_TYPE_RUBY, "Windows-31J", 1, false, true, "CP932", "csWindows31J", NULL); Modified: MacRuby/trunk/encoding.h =================================================================== --- MacRuby/trunk/encoding.h 2010-06-01 09:27:08 UTC (rev 4181) +++ MacRuby/trunk/encoding.h 2010-06-01 22:37:45 UTC (rev 4182) @@ -150,7 +150,7 @@ ENCODING_MACCYRILLIC, ENCODING_BIG5, ENCODING_EUCJP, - //ENCODING_SJIS, + ENCODING_SJIS, //ENCODING_CP932, ENCODINGS_COUNT @@ -295,6 +295,40 @@ STRING_VALID_ENCODING); } +typedef enum { + TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, + TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING, + TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT, + TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR +} transcode_behavior_t; + +typedef enum { + ECONV_INVALID_MASK = 1, + ECONV_INVALID_REPLACE = 1 << 1, + ECONV_UNDEF_MASK = 1 << 2, + ECONV_UNDEF_REPLACE = 1 << 3, + ECONV_UNDEF_HEX_CHARREF = 1 << 4, + ECONV_PARTIAL_INPUT = 1 << 5, + ECONV_AFTER_OUTPUT = 1 << 6, + ECONV_UNIVERSAL_NEWLINE_DECORATOR = 1 << 7, + ECONV_CRLF_NEWLINE_DECORATOR = 1 << 8, + ECONV_CR_NEWLINE_DECORATOR = 1 << 9, + ECONV_XML_TEXT_DECORATOR = 1 << 10, + ECONV_XML_ATTR_CONTENT_DECORATOR = 1 << 11, + ECONV_XML_ATTR_QUOTE_DECORATOR = 1 << 12 +} transcode_flags_t; + +rb_str_t *str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_encoding, + int behavior_for_invalid, int behavior_for_undefined, rb_str_t *replacement_str); + +static inline rb_str_t * +str_simple_transcode(rb_str_t *self, rb_encoding_t *dst_encoding) +{ + return str_transcode(self, self->encoding, dst_encoding, + TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, NULL); +} + + void rb_str_NSCoder_encode(void *coder, VALUE str, const char *key); VALUE rb_str_NSCoder_decode(void *coder, const char *key); @@ -321,7 +355,11 @@ long rb_uchar_strtol(UniChar *chars, long chars_len, long pos, long *end_offset); void rb_str_force_encoding(VALUE str, rb_encoding_t *encoding); +rb_str_t *str_need_string(VALUE str); +rb_str_t *replacement_string_for_encoding(rb_encoding_t* enc); +void str_replace_with_string(rb_str_t *self, rb_str_t *source); + #if defined(__cplusplus) } // extern "C" #endif Modified: MacRuby/trunk/inits.c =================================================================== --- MacRuby/trunk/inits.c 2010-06-01 09:27:08 UTC (rev 4181) +++ MacRuby/trunk/inits.c 2010-06-01 22:37:45 UTC (rev 4182) @@ -58,6 +58,7 @@ void Init_BridgeSupport(void); void Init_FFI(void); void Init_Dispatch(void); +void Init_Transcode(void); void Init_PostVM(void); void @@ -110,5 +111,6 @@ Init_BridgeSupport(); Init_FFI(); Init_Dispatch(); + Init_Transcode(); Init_PostVM(); } Modified: MacRuby/trunk/rakelib/builder/builder.rb =================================================================== --- MacRuby/trunk/rakelib/builder/builder.rb 2010-06-01 09:27:08 UTC (rev 4181) +++ MacRuby/trunk/rakelib/builder/builder.rb 2010-06-01 22:37:45 UTC (rev 4182) @@ -6,7 +6,7 @@ random range rational re ruby signal sprintf st string struct time util variable version thread id objc bs ucnv encoding main dln dmyext marshal gcd vm_eval gc-stub bridgesupport compiler dispatcher vm symbol debugger MacRuby - MacRubyDebuggerConnector NSArray NSDictionary NSString + MacRubyDebuggerConnector NSArray NSDictionary NSString transcode } EXTENSIONS = %w{ Modified: MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/asciicompat_encoding_tags.txt =================================================================== --- MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/asciicompat_encoding_tags.txt 2010-06-01 09:27:08 UTC (rev 4181) +++ MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/asciicompat_encoding_tags.txt 2010-06-01 22:37:45 UTC (rev 4182) @@ -1,7 +1,4 @@ -fails:Encoding::Converter.asciicompat_encoding accepts an encoding name as a String argument fails:Encoding::Converter.asciicompat_encoding coerces non-String/Encoding objects with #to_str fails:Encoding::Converter.asciicompat_encoding accepts an Encoding object as an argument fails:Encoding::Converter.asciicompat_encoding returns a corresponding ASCII compatible encoding for ASCII-incompatible encodings -fails:Encoding::Converter.asciicompat_encoding returns nil when the given encoding is ASCII compatible fails:Encoding::Converter.asciicompat_encoding handles encoding names who resolve to nil encodings -fails:Encoding::Converter.asciicompat_encoding returns nil if called with an encoding it returned previously Deleted: MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/constants_tags.txt =================================================================== --- MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/constants_tags.txt 2010-06-01 09:27:08 UTC (rev 4181) +++ MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/constants_tags.txt 2010-06-01 22:37:45 UTC (rev 4182) @@ -1,26 +0,0 @@ -fails:Encoding::Converter::INVALID_MASK exists -fails:Encoding::Converter::INVALID_MASK has a Fixnum value -fails:Encoding::Converter::INVALID_REPLACE exists -fails:Encoding::Converter::INVALID_REPLACE has a Fixnum value -fails:Encoding::Converter::UNDEF_MASK exists -fails:Encoding::Converter::UNDEF_MASK has a Fixnum value -fails:Encoding::Converter::UNDEF_REPLACE exists -fails:Encoding::Converter::UNDEF_REPLACE has a Fixnum value -fails:Encoding::Converter::UNDEF_HEX_CHARREF exists -fails:Encoding::Converter::UNDEF_HEX_CHARREF has a Fixnum value -fails:Encoding::Converter::PARTIAL_INPUT exists -fails:Encoding::Converter::PARTIAL_INPUT has a Fixnum value -fails:Encoding::Converter::AFTER_OUTPUT exists -fails:Encoding::Converter::AFTER_OUTPUT has a Fixnum value -fails:Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR exists -fails:Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR has a Fixnum value -fails:Encoding::Converter::CRLF_NEWLINE_DECORATOR exists -fails:Encoding::Converter::CRLF_NEWLINE_DECORATOR has a Fixnum value -fails:Encoding::Converter::CR_NEWLINE_DECORATOR exists -fails:Encoding::Converter::CR_NEWLINE_DECORATOR has a Fixnum value -fails:Encoding::Converter::XML_TEXT_DECORATOR exists -fails:Encoding::Converter::XML_TEXT_DECORATOR has a Fixnum value -fails:Encoding::Converter::XML_ATTR_CONTENT_DECORATOR exists -fails:Encoding::Converter::XML_ATTR_CONTENT_DECORATOR has a Fixnum value -fails:Encoding::Converter::XML_ATTR_QUOTE_DECORATOR exists -fails:Encoding::Converter::XML_ATTR_QUOTE_DECORATOR has a Fixnum value Modified: MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/convert_tags.txt =================================================================== --- MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/convert_tags.txt 2010-06-01 09:27:08 UTC (rev 4181) +++ MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/convert_tags.txt 2010-06-01 22:37:45 UTC (rev 4182) @@ -1,7 +1,2 @@ -fails:Encoding::Converter#convert returns a String -fails:Encoding::Converter#convert sets the encoding of the result to the target encoding -fails:Encoding::Converter#convert transcodes the given String to the target encoding fails:Encoding::Converter#convert allows Strings of different encodings to the source encoding -fails:Encoding::Converter#convert reuses the given encoding pair if called multiple times -fails:Encoding::Converter#convert raises UndefinedConversionError if the String contains characters invalid for the target encoding -fails:Encoding::Converter#convert raises an ArgumentError if called on a finished stream + Modified: MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/convpath_tags.txt =================================================================== --- MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/convpath_tags.txt 2010-06-01 09:27:08 UTC (rev 4181) +++ MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/convpath_tags.txt 2010-06-01 22:37:45 UTC (rev 4182) @@ -1,7 +1,2 @@ -fails:Encoding::Converter#convpath returns an Array -fails:Encoding::Converter#convpath returns each encoding pair as a sub-Array -fails:Encoding::Converter#convpath returns each encoding as an Encoding object fails:Encoding::Converter#convpath returns multiple encoding pairs when direct conversion is impossible -fails:Encoding::Converter#convpath sets the last element of each pair to the first element of the next -fails:Encoding::Converter#convpath only lists a source encoding once fails:Encoding::Converter#convpath indicates if crlf_newline conversion would occur Modified: MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/destination_encoding_tags.txt =================================================================== --- MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/destination_encoding_tags.txt 2010-06-01 09:27:08 UTC (rev 4181) +++ MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/destination_encoding_tags.txt 2010-06-01 22:37:45 UTC (rev 4182) @@ -1 +1 @@ -fails:Encoding::Converter#destination_encoding returns the destination encoding as an Encoding object + Modified: MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/replacement_tags.txt =================================================================== --- MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/replacement_tags.txt 2010-06-01 09:27:08 UTC (rev 4181) +++ MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/replacement_tags.txt 2010-06-01 22:37:45 UTC (rev 4182) @@ -1,8 +1,3 @@ fails:Encoding::Converter#replacement returns '?' in US-ASCII when the destination encoding is not UTF-8 -fails:Encoding::Converter#replacement returns � when the destination encoding is UTF-8 -fails:Encoding::Converter#replacement= accepts a String argument -fails:Encoding::Converter#replacement= accepts a String argument of arbitrary length -fails:Encoding::Converter#replacement= raises an TypeError if assigned a non-String argument -fails:Encoding::Converter#replacement= sets #replacement fails:Encoding::Converter#replacement= raises an UndefinedConversionError is the argument cannot be converted into the destination encoding fails:Encoding::Converter#replacement= does not change the replacement character if the argument cannot be converted into the destination encoding Modified: MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/search_convpath_tags.txt =================================================================== --- MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/search_convpath_tags.txt 2010-06-01 09:27:08 UTC (rev 4181) +++ MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/search_convpath_tags.txt 2010-06-01 22:37:45 UTC (rev 4182) @@ -1,8 +1,3 @@ -fails:Encoding::Converter.search_convpath returns an Array -fails:Encoding::Converter.search_convpath returns each encoding pair as a sub-Array -fails:Encoding::Converter.search_convpath returns each encoding as an Encoding object fails:Encoding::Converter.search_convpath returns multiple encoding pairs when direct conversion is impossible -fails:Encoding::Converter.search_convpath sets the last element of each pair to the first element of the next -fails:Encoding::Converter.search_convpath only lists a source encoding once fails:Encoding::Converter.search_convpath indicates if crlf_newline conversion would occur fails:Encoding::Converter.search_convpath raises an Encoding::ConverterNotFoundError if no conversion path exists Modified: MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/source_encoding_tags.txt =================================================================== --- MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/source_encoding_tags.txt 2010-06-01 09:27:08 UTC (rev 4181) +++ MacRuby/trunk/spec/frozen/tags/macruby/core/encoding/converter/source_encoding_tags.txt 2010-06-01 22:37:45 UTC (rev 4182) @@ -1 +1 @@ -fails:Encoding::Converter#source_encoding returns the source encoding as an Encoding object + Modified: MacRuby/trunk/string.c =================================================================== --- MacRuby/trunk/string.c 2010-06-01 09:27:08 UTC (rev 4181) +++ MacRuby/trunk/string.c 2010-06-01 22:37:45 UTC (rev 4182) @@ -251,7 +251,7 @@ } } -static void +void str_replace_with_string(rb_str_t *self, rb_str_t *source) { if (self == source) { @@ -1118,7 +1118,7 @@ self->length_in_bytes, true) != -1; } -static rb_str_t * +rb_str_t * str_need_string(VALUE str) { switch (TYPE(str)) { @@ -1247,24 +1247,6 @@ } } -enum { - TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, - TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING, - TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT, - TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR -}; - - -static rb_str_t * -str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_encoding, - int behavior_for_invalid, int behavior_for_undefined, rb_str_t *replacement_str); -static inline rb_str_t * -str_simple_transcode(rb_str_t *self, rb_encoding_t *dst_encoding) -{ - return str_transcode(self, self->encoding, dst_encoding, - TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, NULL); -} - static void inline str_concat_ascii_cstr(rb_str_t *self, char *cstr) { @@ -1280,7 +1262,7 @@ } } -static rb_str_t * +rb_str_t * str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_encoding, int behavior_for_invalid, int behavior_for_undefined, rb_str_t *replacement_str) { @@ -1844,168 +1826,9 @@ return str_is_ruby_ascii_only(RSTR(self)) ? Qtrue : Qfalse; } -/* - * call-seq: - * str.encode(encoding [, options] ) => str - * str.encode(dst_encoding, src_encoding [, options] ) => str - * str.encode([options]) => str - * - * The first form returns a copy of <i>str</i> transcoded - * to encoding +encoding+. - * The second form returns a copy of <i>str</i> transcoded - * from src_encoding to dst_encoding. - * The last form returns a copy of <i>str</i> transcoded to - * <code>Encoding.default_internal</code>. - * By default, the first and second form raise - * Encoding::UndefinedConversionError for characters that are - * undefined in the destination encoding, and - * Encoding::InvalidByteSequenceError for invalid byte sequences - * in the source encoding. The last form by default does not raise - * exceptions but uses replacement strings. - * The <code>options</code> Hash gives details for conversion. - * - * === options - * The hash <code>options</code> can have the following keys: - * :invalid :: - * If the value is <code>:replace</code>, <code>#encode</code> replaces - * invalid byte sequences in <code>str</code> with the replacement character. - * The default is to raise the exception - * :undef :: - * If the value is <code>:replace</code>, <code>#encode</code> replaces - * characters which are undefined in the destination encoding with - * the replacement character. - * :replace :: - * Sets the replacement string to the value. The default replacement - * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise. - * :xml :: - * The value must be <code>:text</code> or <code>:attr</code>. - * If the value is <code>:text</code> <code>#encode</code> replaces - * undefined characters with their (upper-case hexadecimal) numeric - * character references. '&', '<', and '>' are converted to "&", - * "<", and ">", respectively. - * If the value is <code>:attr</code>, <code>#encode</code> also quotes - * the replacement result (using '"'), and replaces '"' with """. - */ -extern rb_encoding_t *default_internal; -static VALUE -rstr_encode(VALUE str, SEL sel, int argc, VALUE *argv) -{ - VALUE opt = Qnil; - if (argc > 0) { - opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash"); - if (!NIL_P(opt)) { - argc--; - } - } - rb_str_t *self = RSTR(str); - rb_str_t *replacement_str = NULL; - rb_encoding_t *src_encoding, *dst_encoding; - int behavior_for_invalid = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION; - int behavior_for_undefined = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION; - if (argc == 0) { - src_encoding = self->encoding; - dst_encoding = default_internal; - behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING; - behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING; - } - else if (argc == 1) { - src_encoding = self->encoding; - dst_encoding = rb_to_encoding(argv[0]); - } - else if (argc == 2) { - dst_encoding = rb_to_encoding(argv[0]); - src_encoding = rb_to_encoding(argv[1]); - } - else { - rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc); - } - - if (!NIL_P(opt)) { - VALUE invalid_val = rb_hash_aref(opt, ID2SYM(rb_intern("invalid"))); - VALUE replace_sym = ID2SYM(rb_intern("replace")); - if (invalid_val == replace_sym) { - behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING; - } - VALUE undefined_val = rb_hash_aref(opt, ID2SYM(rb_intern("undefined"))); - if (undefined_val == replace_sym) { - behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING; - } - VALUE xml_val = rb_hash_aref(opt, ID2SYM(rb_intern("xml"))); - if (xml_val == ID2SYM(rb_intern("text"))) { - behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT; - } - else if (xml_val == ID2SYM(rb_intern("attr"))) { - behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR; - } - - VALUE replacement = rb_hash_aref(opt, replace_sym); - if (!NIL_P(replacement)) { - replacement_str = str_need_string(replacement); - if ((replacement_str->encoding != dst_encoding) && (replacement_str->length_in_bytes > 0)) { - replacement_str = str_simple_transcode(replacement_str, dst_encoding); - } - if ((behavior_for_invalid != TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING) - && (behavior_for_undefined == TRANSCODE_BEHAVIOR_RAISE_EXCEPTION)) { - behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING; - } - } - } - - if ((replacement_str == NULL) - && ((behavior_for_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING) - || (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING))) { - if (dst_encoding == rb_encodings[ENCODING_UTF16BE]) { - replacement_str = RSTR(rb_enc_str_new("\xFF\xFD", 2, dst_encoding)); - } - else if (dst_encoding == rb_encodings[ENCODING_UTF32BE]) { - replacement_str = RSTR(rb_enc_str_new("\0\0\xFF\xFD", 4, dst_encoding)); - } - else if (dst_encoding == rb_encodings[ENCODING_UTF16LE]) { - replacement_str = RSTR(rb_enc_str_new("\xFD\xFF", 2, dst_encoding)); - } - else if (dst_encoding == rb_encodings[ENCODING_UTF32LE]) { - replacement_str = RSTR(rb_enc_str_new("\xFD\xFF\0\0", 4, dst_encoding)); - } - else if (dst_encoding == rb_encodings[ENCODING_UTF8]) { - replacement_str = RSTR(rb_enc_str_new("\xEF\xBF\xBD", 3, dst_encoding)); - } - else { - replacement_str = RSTR(rb_enc_str_new("?", 1, rb_encodings[ENCODING_ASCII])); - replacement_str = str_simple_transcode(replacement_str, dst_encoding); - } - } - - return (VALUE)str_transcode(self, src_encoding, dst_encoding, - behavior_for_invalid, behavior_for_undefined, replacement_str); -} - /* * call-seq: - * str.encode!(encoding [, options] ) => str - * str.encode!(dst_encoding, src_encoding [, options] ) => str - * - * The first form transcodes the contents of <i>str</i> from - * str.encoding to +encoding+. - * The second form transcodes the contents of <i>str</i> from - * src_encoding to dst_encoding. - * The options Hash gives details for conversion. See String#encode - * for details. - * Returns the string even if no changes were made. - */ -static VALUE -rstr_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv) -{ - rstr_modify(str); - - VALUE new_str = rstr_encode(str, sel, argc, argv); - str_replace_with_string(RSTR(str), RSTR(new_str)); - return str; -} - - -/* - * call-seq: * str[fixnum] => new_str or nil * str[fixnum, fixnum] => new_str or nil * str[range] => new_str or nil @@ -5997,8 +5820,6 @@ rb_objc_define_method(rb_cRubyString, "partition", rstr_partition, 1); rb_objc_define_method(rb_cRubyString, "rpartition", rstr_rpartition, 1); rb_objc_define_method(rb_cRubyString, "crypt", rstr_crypt, 1); - rb_objc_define_method(rb_cRubyString, "encode", rstr_encode, -1); - rb_objc_define_method(rb_cRubyString, "encode!", rstr_encode_bang, -1); // MacRuby extensions. rb_objc_define_method(rb_cRubyString, "transform", rstr_transform, 1); Added: MacRuby/trunk/transcode.c =================================================================== --- MacRuby/trunk/transcode.c (rev 0) +++ MacRuby/trunk/transcode.c 2010-06-01 22:37:45 UTC (rev 4182) @@ -0,0 +1,450 @@ +/* + * MacRuby implementation of transcode.c. + * + * This file is covered by the Ruby license. See COPYING for more details. + * + * Copyright (C) 2007-2010, Apple Inc. All rights reserved. + * Copyright (C) 1993-2007 Yukihiro Matsumoto + * Copyright (C) 2000 Network Applied Communication Laboratory, Inc. + * Copyright (C) 2000 Information-technology Promotion Agency, Japan + */ + +// Notes: +// AFAICT, we need to add support for newline decorators. + +#include "ruby.h" +#include "ruby/encoding.h" +#include "encoding.h" + +static VALUE sym_invalid; +static VALUE sym_undef; +static VALUE sym_replace; +static VALUE sym_xml; +static VALUE sym_text; +static VALUE sym_attr; + +typedef struct rb_econv_s { + rb_encoding_t *source; + rb_encoding_t *destination; + transcode_behavior_t invalid_sequence_behavior; + transcode_behavior_t undefined_conversion_behavior; + transcode_flags_t special_flags; + rb_str_t *replacement; + bool finished; +} rb_econv_t; + +VALUE rb_cEncodingConverter; + +static rb_econv_t* RConverter(VALUE self) { + rb_econv_t *conv; + Data_Get_Struct(self, rb_econv_t, conv); + return conv; +} + +static VALUE +rb_econv_alloc(VALUE klass, SEL sel) +{ + rb_econv_t *conv = ALLOC(rb_econv_t); + conv->source = NULL; + conv->destination = NULL; + conv->replacement = NULL; + conv->special_flags = 0; + conv->finished = false; + return Data_Wrap_Struct(klass, 0, 0, conv); +} + +static VALUE +rb_econv_asciicompat_encoding(VALUE klass, SEL sel, VALUE arg) +{ + rb_encoding_t *enc = NULL; + if (CLASS_OF(arg) == rb_cEncoding) { + enc = rb_to_encoding(arg); + } + else { + StringValue(arg); + enc = rb_enc_find(RSTRING_PTR(arg)); + } + + if ((enc == NULL) || (enc->ascii_compatible)) { + return Qnil; + } + else if (UTF16_ENC(enc) || UTF32_ENC(enc)) { + return (VALUE)rb_utf8_encoding(); + } + // TODO: Port MRI's table that maps ASCII-incompatible encodings to compatible ones. + rb_raise(rb_eConverterNotFoundError, "could not find ASCII-compatible encoding for %s", enc->public_name); +} + +static VALUE rb_econv_convpath(VALUE self, SEL sel); + +static VALUE +rb_econv_search_convpath(VALUE klass, SEL sel, int argc, VALUE* argv) +{ + return rb_econv_convpath(rb_class_new_instance(argc, argv, klass), sel); +} + +static transcode_behavior_t +symbol_option_with_default(VALUE given_symbol, transcode_behavior_t otherwise, const char* name) +{ + if (given_symbol == sym_replace) { + return TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING; + } + else if (given_symbol == sym_attr) { + return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR; + } + else if (given_symbol == sym_text) { + return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT; + } + else if (!NIL_P(given_symbol)) { + rb_raise(rb_eArgError, "unknown value '%s' for option %s", StringValuePtr(given_symbol), name); + } + return otherwise; +} + +static void parse_conversion_options(VALUE options, transcode_behavior_t* behavior_for_invalid, + transcode_behavior_t* behavior_for_undefined, rb_str_t** replacement_str, rb_encoding_t* destination) +{ + + *behavior_for_invalid = symbol_option_with_default(rb_hash_aref(options, sym_invalid), + TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, "invalid-character"); + + *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_undef), + TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, "undefined-conversion"); + + // Because the API conflates the :xml and :undef options, we pass in the previous setting + *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_xml), + *behavior_for_undefined, "xml-replacement"); + + *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_xml), + *behavior_for_undefined, "xml-replacement"); + + VALUE replacement = rb_hash_aref(options, sym_replace); + if (!NIL_P(replacement)) { + *replacement_str = str_simple_transcode(str_need_string(replacement), destination); + } + +} + +static VALUE +rb_econv_initialize(VALUE self, SEL sel, int argc, VALUE* argv) +{ + rb_econv_t *conv = RConverter(self); + VALUE sourceobj, destobj, options; + rb_scan_args(argc, argv, "21", &sourceobj, &destobj, &options); + + rb_encoding_t* source = rb_to_encoding(sourceobj); + rb_encoding_t* destination = rb_to_encoding(destobj); + rb_str_t* replacement_str = NULL; + + conv->source = source; + conv->destination = destination; + + conv->invalid_sequence_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION; + conv->undefined_conversion_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION; + + // Extract the options. This is a hateful, hateful API. + if (!NIL_P(options)) { + + if (FIXNUM_P(options)) { + rb_bug("fixnum arguments are not supported yet."); + } + else if (TYPE(options) == T_HASH) { + parse_conversion_options(options, &conv->invalid_sequence_behavior, + &conv->undefined_conversion_behavior, &replacement_str, destination); + } + else { + rb_raise(rb_eArgError, "expected either a hash or a fixnum as the last parameter"); + } + } + + // Get the default replacement string. For UTF-[8, 16, 32] it's /uFFFD, and for others it's '?' + if (replacement_str == NULL) { + replacement_str = replacement_string_for_encoding(destination); + } + GC_WB(&conv->replacement, replacement_str); + + return self; +} + +static VALUE +rb_econv_inspect(VALUE self, SEL sel) +{ + // TODO: make this comply with the MRI output when we add newline decorators + rb_econv_t *conv = RConverter(self); + return rb_sprintf("#<%s: %s to %s>", rb_obj_classname(self), conv->source->public_name, + conv->destination->public_name); +} + +static VALUE +rb_econv_convpath(VALUE self, SEL sel) +{ + // in MacRuby, the convpath always looks like this: + // [[source_encoding, native UTF-16], [native UTF-16, dest_encoding]] + // The first element is omitted if the source encoding is UTF-16, obviously. + rb_econv_t *conv = RConverter(self); + VALUE to_return = rb_ary_new2(2); + rb_encoding_t* nativeUTF16 = rb_encodings[ENCODING_UTF16_NATIVE]; + + if (conv->source != nativeUTF16) { + rb_ary_push(to_return, rb_assoc_new((VALUE)conv->source, (VALUE)nativeUTF16)); + } + + rb_ary_push(to_return, rb_assoc_new((VALUE)nativeUTF16, (VALUE)conv->destination)); + + return to_return; +} + +static VALUE +rb_econv_source_encoding(VALUE self, SEL sel) +{ + return (VALUE)(RConverter(self)->source); +} + +static VALUE +rb_econv_destination_encoding(VALUE self, SEL sel) +{ + return (VALUE)(RConverter(self)->destination); +} + +// Since our converter is basically a black box at this point, we'll leave +// the lower-level methods unimplemented. +#define rb_econv_primitive_convert rb_f_notimplement + +static VALUE +rb_econv_convert(VALUE self, SEL sel, VALUE str) +{ + rb_econv_t *conv; + Data_Get_Struct(self, rb_econv_t, conv); + + if (conv->finished) { + rb_raise(rb_eArgError, "convert() called on a finished stream"); + } + + assert(conv->replacement->encoding == conv->destination); + return (VALUE)str_transcode(str_need_string(str), conv->source, conv->destination, conv->invalid_sequence_behavior, conv->undefined_conversion_behavior, conv->replacement); +} + +static VALUE +rb_econv_finish(VALUE self, SEL sel) +{ + // TODO: Flesh this out later. + RConverter(self)->finished = true; + return rb_str_new2(""); +} + +#define rb_econv_primitive_errinfo rb_f_notimplement + +#define rb_econv_insert_output rb_f_notimplement + +#define rb_econv_putback rb_f_notimplement + +#define rb_econv_last_error rb_f_notimplement + +static VALUE +rb_econv_replacement(VALUE self, SEL sel) +{ + return (VALUE)(RConverter(self)->replacement); +} + +static VALUE +rb_econv_set_replacement(VALUE self, SEL sel, VALUE str) +{ + // TODO: Should we copy this string? Probably. + rb_econv_t *conv = RConverter(self); + if (TYPE(str) != T_STRING) { + rb_raise(rb_eTypeError, "wrong argument type %s (expected String)", rb_obj_classname(str)); + } + rb_str_force_encoding(str, conv->destination); + GC_WB(&conv->replacement, str_need_string(str)); + return str; +} + +/* + * call-seq: + * str.encode(encoding [, options] ) => str + * str.encode(dst_encoding, src_encoding [, options] ) => str + * str.encode([options]) => str + * + * The first form returns a copy of <i>str</i> transcoded + * to encoding +encoding+. + * The second form returns a copy of <i>str</i> transcoded + * from src_encoding to dst_encoding. + * The last form returns a copy of <i>str</i> transcoded to + * <code>Encoding.default_internal</code>. + * By default, the first and second form raise + * Encoding::UndefinedConversionError for characters that are + * undefined in the destination encoding, and + * Encoding::InvalidByteSequenceError for invalid byte sequences + * in the source encoding. The last form by default does not raise + * exceptions but uses replacement strings. + * The <code>options</code> Hash gives details for conversion. + * + * === options + * The hash <code>options</code> can have the following keys: + * :invalid :: + * If the value is <code>:replace</code>, <code>#encode</code> replaces + * invalid byte sequences in <code>str</code> with the replacement character. + * The default is to raise the exception + * :undef :: + * If the value is <code>:replace</code>, <code>#encode</code> replaces + * characters which are undefined in the destination encoding with + * the replacement character. + * :replace :: + * Sets the replacement string to the value. The default replacement + * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise. + * :xml :: + * The value must be <code>:text</code> or <code>:attr</code>. + * If the value is <code>:text</code> <code>#encode</code> replaces + * undefined characters with their (upper-case hexadecimal) numeric + * character references. '&', '<', and '>' are converted to "&", + * "<", and ">", respectively. + * If the value is <code>:attr</code>, <code>#encode</code> also quotes + * the replacement result (using '"'), and replaces '"' with """. + */ +extern rb_encoding_t *default_internal; +static VALUE +rstr_encode(VALUE str, SEL sel, int argc, VALUE *argv) +{ + VALUE opt = Qnil; + if (argc > 0) { + opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash"); + if (!NIL_P(opt)) { + argc--; + } + } + + rb_str_t *self = RSTR(str); + rb_str_t *replacement_str = NULL; + rb_encoding_t *src_encoding, *dst_encoding; + transcode_behavior_t behavior_for_invalid = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION; + transcode_behavior_t behavior_for_undefined = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION; + if (argc == 0) { + src_encoding = self->encoding; + dst_encoding = default_internal; + behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING; + behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING; + } + else if (argc == 1) { + src_encoding = self->encoding; + dst_encoding = rb_to_encoding(argv[0]); + } + else if (argc == 2) { + dst_encoding = rb_to_encoding(argv[0]); + src_encoding = rb_to_encoding(argv[1]); + } + else { + rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc); + } + + if (!NIL_P(opt)) { + parse_conversion_options(opt, &behavior_for_invalid, &behavior_for_undefined, &replacement_str, dst_encoding); + if ((replacement_str != NULL) + && (behavior_for_invalid != TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING) + && (behavior_for_undefined == TRANSCODE_BEHAVIOR_RAISE_EXCEPTION)) { + behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING; + } + } + + if ((replacement_str == NULL) + && ((behavior_for_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING) + || (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING))) { + replacement_str = replacement_string_for_encoding(dst_encoding); + } + + return (VALUE)str_transcode(self, src_encoding, dst_encoding, + behavior_for_invalid, behavior_for_undefined, replacement_str); +} + +/* + * call-seq: + * str.encode!(encoding [, options] ) => str + * str.encode!(dst_encoding, src_encoding [, options] ) => str + * + * The first form transcodes the contents of <i>str</i> from + * str.encoding to +encoding+. + * The second form transcodes the contents of <i>str</i> from + * src_encoding to dst_encoding. + * The options Hash gives details for conversion. See String#encode + * for details. + * Returns the string even if no changes were made. + */ +static VALUE +rstr_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv) +{ + rstr_modify(str); + + VALUE new_str = rstr_encode(str, sel, argc, argv); + str_replace_with_string(RSTR(str), RSTR(new_str)); + return str; +} + +void +Init_Transcode(void) +{ + rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError); + rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError); + rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError); + + rb_objc_define_method(rb_cRubyString, "encode", rstr_encode, -1); + rb_objc_define_method(rb_cRubyString, "encode!", rstr_encode_bang, -1); + + rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject); + rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "alloc", rb_econv_alloc, 0); + rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "asciicompat_encoding", rb_econv_asciicompat_encoding, 1); + rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "search_convpath", rb_econv_search_convpath, -1); + + rb_objc_define_method(rb_cEncodingConverter, "initialize", rb_econv_initialize, -1); + rb_objc_define_method(rb_cEncodingConverter, "inspect", rb_econv_inspect, 0); + rb_objc_define_method(rb_cEncodingConverter, "convpath", rb_econv_convpath, 0); + rb_objc_define_method(rb_cEncodingConverter, "source_encoding", rb_econv_source_encoding, 0); + rb_objc_define_method(rb_cEncodingConverter, "destination_encoding", rb_econv_destination_encoding, 0); + rb_objc_define_method(rb_cEncodingConverter, "primitive_convert", rb_econv_primitive_convert, -1); + rb_objc_define_method(rb_cEncodingConverter, "convert", rb_econv_convert, 1); + rb_objc_define_method(rb_cEncodingConverter, "finish", rb_econv_finish, 0); + rb_objc_define_method(rb_cEncodingConverter, "primitive_errinfo", rb_econv_primitive_errinfo, 0); + rb_objc_define_method(rb_cEncodingConverter, "insert_output", rb_econv_insert_output, 1); + rb_objc_define_method(rb_cEncodingConverter, "putback", rb_econv_putback, -1); + rb_objc_define_method(rb_cEncodingConverter, "last_error", rb_econv_last_error, 0); + rb_objc_define_method(rb_cEncodingConverter, "replacement", rb_econv_replacement, 0); + rb_objc_define_method(rb_cEncodingConverter, "replacement=", rb_econv_set_replacement, 1); + + sym_invalid = ID2SYM(rb_intern("invalid")); + sym_undef = ID2SYM(rb_intern("undef")); + sym_replace = ID2SYM(rb_intern("replace")); + sym_attr = ID2SYM(rb_intern("attr")); + sym_text = ID2SYM(rb_intern("text")); + sym_xml = ID2SYM(rb_intern("xml")); + + // If only these mapped to the internal enums... + rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK)); + rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE)); + rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK)); + rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE)); + rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF)); + rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT)); + rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT)); + rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR)); + rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR)); + rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR)); + rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR)); + rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR)); + rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR)); + +#if 0 + rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0); + rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0); + rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0); + rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0); + rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0); + + rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0); + rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0); + rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0); + rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0); + rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0); + rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0); + rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0); + + Init_newline(); +#endif +}