[macruby-changes] [3581] MacRuby/branches/icu

Fri Feb 19 20:23:20 PST 2010

Revision: 3581
          http://trac.macosforge.org/projects/ruby/changeset/3581
Author:   lsansonetti at apple.com
Date:     2010-02-19 20:23:17 -0800 (Fri, 19 Feb 2010)
Log Message:
-----------
some work on string

Modified Paths:
--------------
    MacRuby/branches/icu/encoding.c
    MacRuby/branches/icu/encoding.h
    MacRuby/branches/icu/parse.y
    MacRuby/branches/icu/ruby.c
    MacRuby/branches/icu/string.c
    MacRuby/branches/icu/ucnv.c

Modified: MacRuby/branches/icu/encoding.c
===================================================================

--- MacRuby/branches/icu/encoding.c	2010-02-19 23:08:37 UTC (rev 3580)
+++ MacRuby/branches/icu/encoding.c	2010-02-20 04:23:17 UTC (rev 3581)
@@ -9,9 +9,12 @@
  * Copyright (C) 2000 Information-technology Promotion Agency, Japan
  */
 
-#include "encoding.h"
 #include <string.h>
 
+#include "ruby.h"
+#include "ruby/encoding.h"
+#include "encoding.h"
+
 VALUE rb_cEncoding;
 
 static rb_encoding_t *default_internal = NULL;
@@ -241,9 +244,6 @@
     default_internal = rb_encodings[ENCODING_UTF8];
 }
 
-VALUE
-mr_enc_s_is_compatible(VALUE klass, SEL sel, VALUE str1, VALUE str2);
-
 void
 Init_Encoding(void)
 {
@@ -266,10 +266,8 @@
     rb_objc_define_method(CLASS_OF(rb_cEncoding), "aliases",
 	    mr_enc_s_aliases, 0);
     //rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1);
-    // it's defined on Encoding, but it requires String's internals so it's
-    // defined with String
     rb_objc_define_method(CLASS_OF(rb_cEncoding), "compatible?",
-	    mr_enc_s_is_compatible, 2);
+	    mr_enc_s_is_compatible, 2); // in string.c
 
     //rb_define_method(rb_cEncoding, "_dump", enc_dump, -1);
     //rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1);

Modified: MacRuby/branches/icu/encoding.h
===================================================================
--- MacRuby/branches/icu/encoding.h	2010-02-19 23:08:37 UTC (rev 3580)
+++ MacRuby/branches/icu/encoding.h	2010-02-20 04:23:17 UTC (rev 3581)
@@ -16,8 +16,6 @@
 extern "C" {
 #endif
 
-#include "ruby.h"
-
 #if defined(__cplusplus)
 # include "unicode/unistr.h"
 #else
@@ -84,6 +82,18 @@
 
 #define IS_RSTR(x) (rb_klass_is_rstr(*(VALUE *)x))
 
+static inline void
+rstr_modify(VALUE str)
+{
+    const long mask = RBASIC(str)->flags;
+    if ((mask & FL_FREEZE) == FL_FREEZE) {
+        rb_raise(rb_eRuntimeError, "can't modify frozen/immutable string");
+    }
+    if ((mask & FL_TAINT) == FL_TAINT && rb_safe_level() >= 4) {
+        rb_raise(rb_eSecurityError, "Insecure: can't modify string");
+    }
+}
+
 typedef struct {
     long start_offset_in_bytes;
     long end_offset_in_bytes;
@@ -276,6 +286,8 @@
 	bool *need_free_p);
 long rb_str_chars_len(VALUE str);
 
+VALUE mr_enc_s_is_compatible(VALUE klass, SEL sel, VALUE str1, VALUE str2);
+
 // Return a string object appropriate for bstr_ calls. This does nothing for
 // data/binary RubyStrings.
 VALUE rb_str_bstr(VALUE str);

Modified: MacRuby/branches/icu/parse.y
===================================================================
--- MacRuby/branches/icu/parse.y	2010-02-19 23:08:37 UTC (rev 3580)
+++ MacRuby/branches/icu/parse.y	2010-02-20 04:23:17 UTC (rev 3581)
@@ -5067,36 +5067,34 @@
 }
 #endif /* !RIPPER */
 
+struct lex_get_str_context {
+    VALUE str;
+    UChar *chars;
+    long chars_len;
+};
+
 static VALUE
 lex_get_str(struct parser_params *parser, VALUE s)
 {
-    long beg = 0, len; 
-    const long n = CFStringGetLength((CFStringRef)s);
+    struct lex_get_str_context *ctx = (struct lex_get_str_context *)s;
+
+    long beg = 0;
     if (lex_gets_ptr > 0) {
-	if (n == lex_gets_ptr) {
+	if (ctx->chars_len == lex_gets_ptr) {
 	    return Qnil;
 	}
 	beg += lex_gets_ptr;
     }
 
-    CFRange search_range;  
-    if (CFStringFindCharacterFromSet((CFStringRef)s, 
-		CFCharacterSetGetPredefined(kCFCharacterSetNewline),
-		CFRangeMake(beg, n - beg),
-		0,
-		&search_range)) {
-	lex_gets_ptr = search_range.location + 1;
-	len = search_range.location - beg;
+    lex_gets_ptr = ctx->chars_len;
+    for (long i = beg; i < ctx->chars_len; i++) {
+	if (ctx->chars[i] == '\n') {
+	    lex_gets_ptr = i + 1;
+	    break;
+	}
     }
-    else {
-	lex_gets_ptr = n;
-	len = lex_gets_ptr - beg;	
-    }
 
-    CFStringRef subs = CFStringCreateWithSubstring(NULL, (CFStringRef)s, 
-	    CFRangeMake(beg, lex_gets_ptr - beg));
-    CFMakeCollectable(subs);
-    return (VALUE)subs;
+    return rb_unicode_str_new(&ctx->chars[beg], lex_gets_ptr - beg);
 }
 
 static VALUE
@@ -5118,15 +5116,27 @@
     return rb_parser_compile_string(rb_parser_new(), f, s, line);
 }
 
-NODE*
+NODE *
 rb_parser_compile_string(VALUE vparser, const char *f, VALUE s, int line)
 {
     struct parser_params *parser;
     Data_Get_Struct(vparser, struct parser_params, parser);
 
+    UChar *chars = NULL;
+    long chars_len = 0;
+    bool need_free = false;
+    rb_str_get_uchars(s, &chars, &chars_len, &need_free);
+    assert(!need_free);
+
+    struct lex_get_str_context *ctx = (struct lex_get_str_context *)
+	xmalloc(sizeof(struct lex_get_str_context));
+    GC_WB(&ctx->str, s);
+    ctx->chars = chars;
+    ctx->chars_len = chars_len;
+
     lex_gets = lex_get_str;
     lex_gets_ptr = 0;
-    GC_WB(&lex_input, s);
+    GC_WB(&lex_input, ctx);
     lex_pbeg = lex_p = lex_pend = 0;
     compile_for_eval = rb_parse_in_eval();
 

Modified: MacRuby/branches/icu/ruby.c
===================================================================
--- MacRuby/branches/icu/ruby.c	2010-02-19 23:08:37 UTC (rev 3580)
+++ MacRuby/branches/icu/ruby.c	2010-02-20 04:23:17 UTC (rev 3581)
@@ -553,17 +553,19 @@
 
 	  case 'e':
 	    forbid_setid("-e");
-	    if (!*++s) {
+	    if (*++s == '\0') {
 		s = argv[1];
-		argc--, argv++;
+		argc--;
+		argv++;
 	    }
-	    if (!s) {
+	    if (s == NULL) {
 		rb_raise(rb_eRuntimeError, "no code specified for -e");
 	    }
-	    if (!opt->e_script) {
-		opt->e_script = rb_str_new(0, 0);
-		if (opt->script == 0)
+	    if (opt->e_script == 0) {
+		opt->e_script = rb_str_new(NULL, 0);
+		if (opt->script == NULL) {
 		    opt->script = "-e";
+		}
 	    }
 	    rb_str_cat2(opt->e_script, s);
 	    rb_str_cat2(opt->e_script, "\n");
@@ -571,12 +573,13 @@
 
 	  case 'r':
 	    forbid_setid("-r");
-	    if (*++s) {
+	    if (*++s != '\0') {
 		add_modules(s);
 	    }
 	    else if (argv[1]) {
 		add_modules(argv[1]);
-		argc--, argv++;
+		argc--;
+		argv++;
 	    }
 	    break;
 

Modified: MacRuby/branches/icu/string.c
===================================================================
--- MacRuby/branches/icu/string.c	2010-02-19 23:08:37 UTC (rev 3580)
+++ MacRuby/branches/icu/string.c	2010-02-20 04:23:17 UTC (rev 3581)
@@ -1,5 +1,5 @@
 /* 
- * MacRuby implementation of Ruby 1.9 String.
+ * MacRuby Strings.
  *
  * This file is covered by the Ruby license. See COPYING for more details.
  * 
@@ -11,7 +11,10 @@
 
 #include <stdio.h>
 #include <stdarg.h>
+#include <wctype.h>
 
+#include "ruby.h"
+#include "ruby/encoding.h"
 #include "encoding.h"
 #include "objc.h"
 #include "id.h"
@@ -26,6 +29,8 @@
 
 VALUE rb_fs;
 
+// rb_str_t primitives.
+
 static void
 str_update_flags_utf16(rb_str_t *self)
 {
@@ -215,11 +220,12 @@
 
     self->flags = 0;
     self->encoding = enc;
-    self->capacity_in_bytes = self->length_in_bytes = len;
+    self->capacity_in_bytes = len;
     if (len > 0) {
 	GC_WB(&self->data.bytes, xmalloc(len));
 	if (bytes != NULL) {
 	    memcpy(self->data.bytes, bytes, len);
+	    self->length_in_bytes = len;
 	}
     }
     else {
@@ -239,14 +245,39 @@
 }
 
 static void
-str_replace_with_unichars(rb_str_t *self, const UniChar *chars, long len)
+str_append_uchar(rb_str_t *self, UChar c)
 {
+    assert(str_is_stored_in_uchars(self));
+    const long uchar_cap = BYTES_TO_UCHARS(self->capacity_in_bytes);
+    const long uchar_len = BYTES_TO_UCHARS(self->length_in_bytes);
+    if (uchar_len + 1 >= uchar_cap) {
+	assert(uchar_len + 1 < uchar_cap + 10);
+	self->capacity_in_bytes += UCHARS_TO_BYTES(10);
+	UChar *uchars = (UChar *)xrealloc(self->data.uchars,
+		self->capacity_in_bytes);
+	if (uchars != self->data.uchars) {
+	    GC_WB(&self->data.uchars, uchars);
+	}
+    }
+    self->data.uchars[uchar_len] = c;
+    self->length_in_bytes += UCHARS_TO_BYTES(1);
+}
+
+static void
+str_replace_with_uchars(rb_str_t *self, const UChar *chars, long len)
+{
+    assert(len >= 0);
+
+    len = UCHARS_TO_BYTES(len);
     self->flags = 0;
     self->encoding = rb_encodings[ENCODING_UTF8];
-    self->capacity_in_bytes = self->length_in_bytes = UCHARS_TO_BYTES(len);
-    if (self->length_in_bytes != 0) {
-	GC_WB(&self->data.uchars, xmalloc(self->length_in_bytes));
-	memcpy(self->data.uchars, chars, self->length_in_bytes);
+    self->capacity_in_bytes = len;
+    if (len > 0) {
+	GC_WB(&self->data.uchars, xmalloc(len));
+	if (chars != NULL) {
+	    memcpy(self->data.uchars, chars, len);
+	    self->length_in_bytes = len;
+	}
 	str_set_stored_in_uchars(self, true);
     }
 }
@@ -260,47 +291,31 @@
 	chars = (UniChar *)malloc(sizeof(UniChar) * len);
 	CFStringGetCharacters(source, CFRangeMake(0, len), chars);
     }
-    str_replace_with_unichars(self, chars, len);
+    str_replace_with_uchars(self, chars, len);
 }
 
 static void
 str_replace(rb_str_t *self, VALUE arg)
 {
-    if (!SPECIAL_CONST_P(arg) && IS_RSTR(arg)) {
-	str_replace_with_string(self, RSTR(arg));
-    }
-    else {
-	switch (TYPE(arg)) {
-	    case T_STRING:
+    switch (TYPE(arg)) {
+	case T_STRING:
+	    if (IS_RSTR(arg)) {
+		str_replace_with_string(self, RSTR(arg));
+	    }
+	    else {
 		str_replace_with_cfstring(self, (CFStringRef)arg);
-		break;
-	    case T_SYMBOL:
-		abort(); // TODO
-	    default:
-		str_replace(self, rb_str_to_str(arg));
-		break;
-	}
+	    }
+	    break;
+	default:
+	    str_replace(self, rb_str_to_str(arg));
+	    break;
     }
 }
 
 static rb_str_t *
-str_dup(VALUE source)
+str_dup(rb_str_t *source)
 {
     rb_str_t *destination = str_alloc(rb_cRubyString);
-    str_replace(destination, source);
-    return destination;
-}
-
-static void
-str_clear(rb_str_t *self)
-{
-    self->length_in_bytes = 0;
-}
-
-static rb_str_t *
-str_new_from_string(rb_str_t *source)
-{
-    rb_str_t *destination = str_alloc(rb_cRubyString);
     str_replace_with_string(destination, source);
     return destination;
 }
@@ -426,81 +441,6 @@
     }
 }
 
-static bool
-str_getbyte(rb_str_t *self, long index, unsigned char *c)
-{
-    if (str_is_stored_in_uchars(self) && NATIVE_UTF16_ENC(self->encoding)) {
-	if (index < 0) {
-	    index += self->length_in_bytes;
-	    if (index < 0) {
-		return false;
-	    }
-	}
-	if (index >= self->length_in_bytes) {
-	    return false;
-	}
-	if (NATIVE_UTF16_ENC(self->encoding)) {
-	    *c = self->data.bytes[index];
-	}
-	else { // non native byte-order UTF-16
-	    if ((index & 1) == 0) { // even
-		*c = self->data.bytes[index+1];
-	    }
-	    else { // odd
-		*c = self->data.bytes[index-1];
-	    }
-	}
-    }
-    else {
-	// work with a binary string
-	// (UTF-16 strings could be converted to their binary form
-	//  on the fly but that would just add complexity)
-	str_make_data_binary(self);
-
-	if (index < 0) {
-	    index += self->length_in_bytes;
-	    if (index < 0) {
-		return false;
-	    }
-	}
-	if (index >= self->length_in_bytes) {
-	    return false;
-	}
-	*c = self->data.bytes[index];
-    }
-    return true;
-}
-
-static void
-str_setbyte(rb_str_t *self, long index, unsigned char value)
-{
-    str_make_data_binary(self);
-    if ((index < -self->length_in_bytes) || (index >= self->length_in_bytes)) {
-	rb_raise(rb_eIndexError, "index %ld out of string", index);
-    }
-    if (index < 0) {
-	index += self->length_in_bytes;
-    }
-    self->data.bytes[index] = value;
-}
-
-static void
-str_force_encoding(rb_str_t *self, rb_encoding_t *enc)
-{
-    if (enc == self->encoding) {
-	return;
-    }
-    str_make_data_binary(self);
-    if (NATIVE_UTF16_ENC(self->encoding)) {
-	str_set_stored_in_uchars(self, false);
-    }
-    self->encoding = enc;
-    str_unset_facultative_flags(self);
-    if (NATIVE_UTF16_ENC(self->encoding)) {
-	str_set_stored_in_uchars(self, true);
-    }
-}
-
 static rb_str_t *
 str_new_similar_empty_string(rb_str_t *self)
 {
@@ -661,109 +601,6 @@
     return boundaries;
 }
 
-character_boundaries_t
-str_get_next_line_end_character_boundaries(rb_str_t *self,
-	long start_offset_in_bytes)
-{
-    character_boundaries_t boundaries = {self->length_in_bytes,
-	self->length_in_bytes};
-
-    if (start_offset_in_bytes >= self->length_in_bytes) {
-	return boundaries;
-    }
-
-    if (str_is_stored_in_uchars(self) || NON_NATIVE_UTF16_ENC(self->encoding)) {
-	UChar line_feed, carriage_return;
-	if (str_is_stored_in_uchars(self)) {
-	    line_feed = 0x0A;
-	    carriage_return = 0x0D;
-	}
-	else {
-	    line_feed = 0x0A00;
-	    carriage_return = 0x0D00;
-	}
-	const long start_offset = BYTES_TO_UCHARS(start_offset_in_bytes);
-	const long length = BYTES_TO_UCHARS(self->length_in_bytes);
-	for (long i = start_offset; i < length; ++i) {
-	    const UChar c = self->data.uchars[i];
-	    if (c == line_feed) {
-		boundaries.start_offset_in_bytes = UCHARS_TO_BYTES(i);
-		boundaries.end_offset_in_bytes = UCHARS_TO_BYTES(i+1);
-		return boundaries;
-	    }
-	    else if (c == carriage_return) {
-		boundaries.start_offset_in_bytes = UCHARS_TO_BYTES(i);
-		if ((i+1 < length) && (self->data.uchars[i+1] == line_feed)) {
-		    boundaries.end_offset_in_bytes = UCHARS_TO_BYTES(i+2);
-		}
-		else {
-		    boundaries.end_offset_in_bytes = UCHARS_TO_BYTES(i+1);
-		}
-		return boundaries;
-	    }
-	}
-    }
-    else if (self->encoding->ascii_compatible) {
-	const char line_feed = 0x0A, carriage_return = 0x0D;
-	for (long i = start_offset_in_bytes; i < self->length_in_bytes; ++i) {
-	    const char c = self->data.bytes[i];
-	    if (c == line_feed) {
-		boundaries.start_offset_in_bytes = i;
-		boundaries.end_offset_in_bytes = i+1;
-		return boundaries;
-	    }
-	    else if (c == carriage_return) {
-		boundaries.start_offset_in_bytes = i;
-		if ((i+1 < self->length_in_bytes)
-			&& (self->data.bytes[i+1] == line_feed)) {
-		    boundaries.end_offset_in_bytes = i+2;
-		}
-		else {
-		    boundaries.end_offset_in_bytes = i+1;
-		}
-		return boundaries;
-	    }
-	}
-    }
-    else if (UTF32_ENC(self->encoding)) {
-	int32_t line_feed, carriage_return;
-	if (NATIVE_UTF32_ENC(self->encoding)) {
-	    line_feed = 0x0A;
-	    carriage_return = 0x0D;
-	}
-	else {
-	    line_feed = 0x0A000000;
-	    carriage_return = 0x0D000000;
-	}
-	const long start_offset = start_offset_in_bytes / 4;
-	const long length = self->length_in_bytes / 4;
-	for (long i = start_offset; i < length; ++i) {
-	    int32_t c = ((int32_t *)self->data.bytes)[i];
-	    if (c == line_feed) {
-		boundaries.start_offset_in_bytes = i * 4;
-		boundaries.end_offset_in_bytes = (i+1) * 4;
-		return boundaries;
-	    }
-	    else if (c == carriage_return) {
-		boundaries.start_offset_in_bytes = i * 4;
-		if ((i+1 < length)
-			&& (((int32_t *)self->data.bytes)[i+1] == line_feed)) {
-		    boundaries.end_offset_in_bytes = (i+2) * 4;
-		}
-		else {
-		    boundaries.end_offset_in_bytes = (i+1) * 4;
-		}
-		return boundaries;
-	    }
-	}
-    }
-    else {
-	abort(); // we should never get there
-    }
-
-    return boundaries;
-}
-
 static rb_str_t *
 str_get_characters(rb_str_t *self, long first, long last, bool ucs2_mode)
 {
@@ -813,74 +650,6 @@
 	    - first_boundaries.start_offset_in_bytes);
 }
 
-static rb_str_t *
-str_get_character_at(rb_str_t *self, long index, bool ucs2_mode)
-{
-    if (self->length_in_bytes == 0) {
-	return NULL;
-    }
-    if (!self->encoding->single_byte_encoding
-	    && !str_is_stored_in_uchars(self)) {
-	// if we can't access the bytes directly,
-	// try to convert the string in UTF-16
-	str_try_making_data_uchars(self);
-    }
-    character_boundaries_t boundaries = str_get_character_boundaries(self,
-	    index, ucs2_mode);
-    if (boundaries.start_offset_in_bytes == -1) {
-	if (boundaries.end_offset_in_bytes == -1) {
-	    return NULL;
-	}
-	else {
-	    // you cannot cut a surrogate in an encoding that is not UTF-16
-	    str_cannot_cut_surrogate();
-	}
-    }
-    else if (boundaries.end_offset_in_bytes == -1) {
-	// you cannot cut a surrogate in an encoding that is not UTF-16
-	str_cannot_cut_surrogate();
-    }
-
-    if (boundaries.start_offset_in_bytes >= self->length_in_bytes) {
-	return NULL;
-    }
-    if (boundaries.end_offset_in_bytes >= self->length_in_bytes) {
-	boundaries.end_offset_in_bytes = self->length_in_bytes;
-    }
-
-    return str_new_copy_of_part(self, boundaries.start_offset_in_bytes,
-	    boundaries.end_offset_in_bytes
-	    - boundaries.start_offset_in_bytes);
-}
-
-static rb_str_t *
-str_plus_string(rb_str_t *str1, rb_str_t *str2)
-{
-    rb_encoding_t *new_encoding = str_must_have_compatible_encoding(str1, str2);
-
-    rb_str_t *new_str = str_alloc(rb_cRubyString);
-    new_str->encoding = new_encoding;
-    if ((str1->length_in_bytes == 0) && (str2->length_in_bytes == 0)) {
-	return new_str;
-    }
-
-    str_make_same_format(str1, str2);
-
-    str_set_stored_in_uchars(new_str, str_is_stored_in_uchars(str1));
-    long length_in_bytes = str1->length_in_bytes + str2->length_in_bytes;
-    GC_WB(&new_str->data.bytes, xmalloc(length_in_bytes));
-    if (str1->length_in_bytes > 0) {
-	memcpy(new_str->data.bytes, str1->data.bytes, str1->length_in_bytes);
-    }
-    if (str2->length_in_bytes > 0) {
-	memcpy(new_str->data.bytes + str1->length_in_bytes, str2->data.bytes,
-		str2->length_in_bytes);
-    }
-    new_str->capacity_in_bytes = new_str->length_in_bytes = length_in_bytes;
-
-    return new_str;
-}
-
 static void
 str_resize_bytes(rb_str_t *self, long new_capacity)
 {
@@ -1110,16 +879,11 @@
 static rb_str_t *
 str_need_string(VALUE str)
 {
-    if (!SPECIAL_CONST_P(str) && IS_RSTR(str)) {
-	return (rb_str_t *)str;
-    }
     if (TYPE(str) != T_STRING) {
 	str = rb_str_to_str(str);
     }
-    if (IS_RSTR(str)) {
-	return (rb_str_t *)str;
-    }
-    return str_new_from_cfstring((CFStringRef)str);
+    return IS_RSTR(str)
+	? (rb_str_t *)str : str_new_from_cfstring((CFStringRef)str);
 }
 
 void
@@ -1164,6 +928,28 @@
     *need_free_p = need_free;
 }
 
+static VALUE
+str_substr(VALUE str, long beg, long len)
+{
+    if (len < 0) {
+	return Qnil;
+    }
+
+    const long n = str_length(RSTR(str), true);
+    if (beg < 0) {
+	beg += n;
+    }
+    if (beg > n || beg < 0) {
+	return Qnil;
+    }
+    if (beg + len > n) {
+	len = n - beg;
+    }
+
+    rb_str_t *substr = str_get_characters(RSTR(str), beg, beg + len - 1, true);
+    return substr == NULL ? Qnil : (VALUE)substr;
+}
+
 //----------------------------------------------
 // Functions called by MacRuby
 
@@ -1185,13 +971,20 @@
 }
 
 static VALUE
-mr_str_s_alloc(VALUE klass)
+rstr_alloc(VALUE klass, SEL sel)
 {
     return (VALUE)str_alloc(klass);
 }
 
+/*
+ *  call-seq:
+ *     String.new(str="")   => new_str
+ *  
+ *  Returns a new string object containing a copy of <i>str</i>.
+ */
+
 static VALUE
-mr_str_initialize(VALUE self, SEL sel, int argc, VALUE *argv)
+rstr_initialize(VALUE self, SEL sel, int argc, VALUE *argv)
 {
     if (argc > 0) {
 	assert(argc == 1);
@@ -1200,272 +993,616 @@
     return self;
 }
 
+/*
+ *  call-seq:
+ *     str.replace(other_str)   => str
+ *  
+ *  Replaces the contents and taintedness of <i>str</i> with the corresponding
+ *  values in <i>other_str</i>.
+ *     
+ *     s = "hello"         #=> "hello"
+ *     s.replace "world"   #=> "world"
+ */
+
 static VALUE
-mr_str_replace(VALUE self, SEL sel, VALUE arg)
+rstr_replace(VALUE self, SEL sel, VALUE arg)
 {
+    rstr_modify(self);
     str_replace(RSTR(self), arg);
     return self;
 }
 
 static VALUE
-mr_str_clear(VALUE self, SEL sel)
+rstr_copy(VALUE rcv, VALUE klass)
 {
-    str_clear(RSTR(self));
+    VALUE dup = rstr_alloc(klass, 0);
+    rstr_replace(dup, 0, rcv);
+    return dup;
+}
+
+static VALUE
+rstr_dup(VALUE str, SEL sel)
+{
+    VALUE klass = CLASS_OF(str);
+    while (RCLASS_SINGLETON(klass)) {
+	klass = RCLASS_SUPER(klass);
+    }
+    assert(rb_klass_is_rstr(klass));
+
+    VALUE dup = rstr_copy(str, klass);
+
+    if (OBJ_TAINTED(str)) {
+	OBJ_TAINT(dup);
+    }
+    if (OBJ_UNTRUSTED(str)) {
+	OBJ_UNTRUST(dup);
+    }
+    return dup;
+}
+
+static VALUE
+rstr_clone(VALUE str, SEL sel)
+{
+    VALUE clone = rstr_copy(str, CLASS_OF(str));
+
+    if (OBJ_TAINTED(str)) {
+	OBJ_TAINT(clone);
+    }
+    if (OBJ_UNTRUSTED(str)) {
+	OBJ_UNTRUST(clone);
+    }
+    if (OBJ_FROZEN(str)) {
+	OBJ_FREEZE(clone);
+    }
+    return clone;
+}
+
+/*
+ *  call-seq:
+ *     string.clear    ->  string
+ *
+ *  Makes string empty.
+ *
+ *     a = "abcde"
+ *     a.clear    #=> ""
+ */
+
+static VALUE
+rstr_clear(VALUE self, SEL sel)
+{
+    rstr_modify(self);
+    RSTR(self)->length_in_bytes = 0;
     return self;
 }
 
 static VALUE
-mr_str_chars_count(VALUE self, SEL sel)
+rstr_chars_count(VALUE self, SEL sel)
 {
     return INT2NUM(str_length(RSTR(self), false));
 }
 
+/*
+ *  call-seq:
+ *     str.length   => integer
+ *     str.size     => integer
+ *  
+ *  Returns the character length of <i>str</i>.
+ */
+
 static VALUE
-mr_str_length(VALUE self, SEL sel)
+rstr_length(VALUE self, SEL sel)
 {
     return INT2NUM(str_length(RSTR(self), true));
 }
 
+/*
+ *  call-seq:
+ *     str.bytesize  => integer
+ *  
+ *  Returns the length of <i>str</i> in bytes.
+ */
+
 static VALUE
-mr_str_bytesize(VALUE self, SEL sel)
+rstr_bytesize(VALUE self, SEL sel)
 {
     return INT2NUM(str_bytesize(RSTR(self)));
 }
 
 static VALUE
-mr_str_encoding(VALUE self, SEL sel)
+rstr_encoding(VALUE self, SEL sel)
 {
     return (VALUE)RSTR(self)->encoding;
 }
 
+/*
+ *  call-seq:
+ *     str.getbyte(index)          => 0 .. 255
+ *
+ *  returns the <i>index</i>th byte as an integer.
+ */
+
 static VALUE
-mr_str_getbyte(VALUE self, SEL sel, VALUE index)
+rstr_getbyte(VALUE self, SEL sel, VALUE index)
 {
-    unsigned char c;
-    if (str_getbyte(RSTR(self), NUM2LONG(index), &c)) {
-	return INT2NUM(c);
+    unsigned char c = 0;
+    long idx = NUM2LONG(index);
+
+    if (str_is_stored_in_uchars(RSTR(self))
+	    && NATIVE_UTF16_ENC(RSTR(self)->encoding)) {
+	if (idx < 0) {
+	    idx += RSTR(self)->length_in_bytes;
+	    if (idx < 0) {
+		return Qnil;
+	    }
+	}
+	if (idx >= RSTR(self)->length_in_bytes) {
+	    return Qnil;
+	}
+	if (NATIVE_UTF16_ENC(RSTR(self)->encoding)) {
+	    c = RSTR(self)->data.bytes[idx];
+	}
+	else { // non native byte-order UTF-16
+	    if ((idx & 1) == 0) { // even
+		c = RSTR(self)->data.bytes[idx+1];
+	    }
+	    else { // odd
+		c = RSTR(self)->data.bytes[idx-1];
+	    }
+	}
     }
     else {
-	return Qnil;
+	// work with a binary string
+	// (UTF-16 strings could be converted to their binary form
+	//  on the fly but that would just add complexity)
+	str_make_data_binary(RSTR(self));
+
+	if (idx < 0) {
+	    idx += RSTR(self)->length_in_bytes;
+	    if (idx < 0) {
+		return Qnil;
+	    }
+	}
+	if (idx >= RSTR(self)->length_in_bytes) {
+	    return Qnil;
+	}
+	c = RSTR(self)->data.bytes[idx];
     }
+
+    return INT2FIX(c); 
 }
 
+/*
+ *  call-seq:
+ *     str.setbyte(index, int) => int
+ *
+ *  modifies the <i>index</i>th byte as <i>int</i>.
+ */
+
 static VALUE
-mr_str_setbyte(VALUE self, SEL sel, VALUE index, VALUE value)
+rstr_setbyte(VALUE self, SEL sel, VALUE index, VALUE value)
 {
-    str_setbyte(RSTR(self), NUM2LONG(index),
-	    0xFF & (unsigned long)NUM2LONG(value));
+    rstr_modify(self);
+    str_make_data_binary(RSTR(self));
+    if ((index < -RSTR(self)->length_in_bytes)
+	    || (index >= RSTR(self)->length_in_bytes)) {
+	rb_raise(rb_eIndexError, "index %ld out of string", index);
+    }
+    if (index < 0) {
+	index += RSTR(self)->length_in_bytes;
+    }
+    RSTR(self)->data.bytes[index] = value;
     return value;
 }
 
+/*
+ *  call-seq:
+ *     str.force_encoding(encoding)   => str
+ *
+ *  Changes the encoding to +encoding+ and returns self.
+ */
+
 static VALUE
-mr_str_force_encoding(VALUE self, SEL sel, VALUE encoding)
+rstr_force_encoding(VALUE self, SEL sel, VALUE encoding)
 {
-    rb_encoding_t *enc;
-    if (SPECIAL_CONST_P(encoding) || (CLASS_OF(encoding) != rb_cEncoding)) {
-	abort(); // TODO
+    rstr_modify(self);
+    rb_encoding_t *enc = rb_to_encoding(encoding);
+    if (enc != RSTR(self)->encoding) {
+	str_make_data_binary(RSTR(self));
+	if (NATIVE_UTF16_ENC(RSTR(self)->encoding)) {
+	    str_set_stored_in_uchars(RSTR(self), false);
+	}
+	RSTR(self)->encoding = enc;
+	str_unset_facultative_flags(RSTR(self));
+	if (NATIVE_UTF16_ENC(RSTR(self)->encoding)) {
+	    str_set_stored_in_uchars(RSTR(self), true);
+	}
     }
-    enc = (rb_encoding_t *)encoding;
-    str_force_encoding(RSTR(self), enc);
     return self;
 }
 
+/*
+ *  call-seq:
+ *     str.valid_encoding?  => true or false
+ *  
+ *  Returns true for a string which encoded correctly.
+ *
+ *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding? => true
+ *    "\xc2".force_encoding("UTF-8").valid_encoding? => false
+ *    "\x80".force_encoding("UTF-8").valid_encoding? => false
+ */
+
 static VALUE
-mr_str_is_valid_encoding(VALUE self, SEL sel)
+rstr_is_valid_encoding(VALUE self, SEL sel)
 {
     return str_is_valid_encoding(RSTR(self)) ? Qtrue : Qfalse;
 }
 
+/*
+ *  call-seq:
+ *     str.ascii_only?  => true or false
+ *  
+ *  Returns true for a string which has only ASCII characters.
+ *
+ *    "abc".force_encoding("UTF-8").ascii_only? => true
+ *    "abc\u{6666}".force_encoding("UTF-8").ascii_only? => false
+ */
+
 static VALUE
-mr_str_is_ascii_only(VALUE self, SEL sel)
+rstr_is_ascii_only(VALUE self, SEL sel)
 {
     return str_is_ruby_ascii_only(RSTR(self)) ? Qtrue : Qfalse;
 }
 
+/*
+ *  call-seq:
+ *     str[fixnum]                 => new_str or nil
+ *     str[fixnum, fixnum]         => new_str or nil
+ *     str[range]                  => new_str or nil
+ *     str[regexp]                 => new_str or nil
+ *     str[regexp, fixnum]         => new_str or nil
+ *     str[other_str]              => new_str or nil
+ *     str.slice(fixnum)           => new_str or nil
+ *     str.slice(fixnum, fixnum)   => new_str or nil
+ *     str.slice(range)            => new_str or nil
+ *     str.slice(regexp)           => new_str or nil
+ *     str.slice(regexp, fixnum)   => new_str or nil
+ *     str.slice(other_str)        => new_str or nil
+ *  
+ *  Element Reference---If passed a single <code>Fixnum</code>, returns a
+ *  substring of one character at that position. If passed two <code>Fixnum</code>
+ *  objects, returns a substring starting at the offset given by the first, and
+ *  a length given by the second. If given a range, a substring containing
+ *  characters at offsets given by the range is returned. In all three cases, if
+ *  an offset is negative, it is counted from the end of <i>str</i>. Returns
+ *  <code>nil</code> if the initial offset falls outside the string, the length
+ *  is negative, or the beginning of the range is greater than the end.
+ *     
+ *  If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
+ *  returned. If a numeric parameter follows the regular expression, that
+ *  component of the <code>MatchData</code> is returned instead. If a
+ *  <code>String</code> is given, that string is returned if it occurs in
+ *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
+ *  match.
+ *     
+ *     a = "hello there"
+ *     a[1]                   #=> "e"
+ *     a[1,3]                 #=> "ell"
+ *     a[1..3]                #=> "ell"
+ *     a[-3,2]                #=> "er"
+ *     a[-4..-2]              #=> "her"
+ *     a[12..-1]              #=> nil
+ *     a[-2..-4]              #=> ""
+ *     a[/[aeiou](.)\1/]      #=> "ell"
+ *     a[/[aeiou](.)\1/, 0]   #=> "ell"
+ *     a[/[aeiou](.)\1/, 1]   #=> "l"
+ *     a[/[aeiou](.)\1/, 2]   #=> nil
+ *     a["lo"]                #=> "lo"
+ *     a["bye"]               #=> nil
+ */
+
 static VALUE
-mr_str_aref(VALUE self, SEL sel, int argc, VALUE *argv)
+rstr_aref(VALUE str, SEL sel, int argc, VALUE *argv)
 {
-    rb_str_t *ret;
-    if (argc == 1) {
-	VALUE index = argv[0];
-	switch (TYPE(index)) {
-	    case T_FIXNUM:
-		ret = str_get_character_at(RSTR(self), FIX2LONG(index), true);
-		break;
+    if (argc == 2) {
+	if (TYPE(argv[0]) == T_REGEXP) {
+	    // TODO
+	    //return rb_str_subpat(str, argv[0], NUM2INT(argv[1]));
+	    return Qnil;
+	}
+	return str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
+    }
 
-	    case T_REGEXP:
-		abort(); // TODO
+    if (argc != 1) {
+	rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
+    }
 
-	    case T_STRING:
-		{
-		    if (IS_RSTR(index)) {
-			rb_str_t *searched = RSTR(index);
-			if (str_include_string(RSTR(self), searched)) {
-			    return (VALUE)str_new_from_string(searched);
-			}
+    VALUE indx = argv[0];
+    switch (TYPE(indx)) {
+	case T_FIXNUM:
+	    str = str_substr(str, FIX2LONG(indx), 1);
+	    if (!NIL_P(str) && str_length(RSTR(str), true) == 0) {
+		return Qnil;
+	    }
+	    return str;
+
+	case T_REGEXP:
+	    abort(); // TODO
+
+	case T_STRING:
+	    {
+		if (IS_RSTR(indx)) {
+		    rb_str_t *searched = RSTR(indx);
+		    if (str_include_string(RSTR(str), searched)) {
+			return (VALUE)str_dup(searched);
 		    }
-		    else {
-			rb_str_t *searched =
-			    str_new_from_cfstring((CFStringRef)index);
-			if (str_include_string(RSTR(self), searched)) {
-			    // no need to duplicate the string as we just
-			    // created it
-			    return (VALUE)searched;
-			}
+		}
+		else {
+		    rb_str_t *searched =
+			str_new_from_cfstring((CFStringRef)indx);
+		    if (str_include_string(RSTR(str), searched)) {
+			// no need to duplicate the string as we just
+			// created it
+			return (VALUE)searched;
 		    }
-		    return Qnil;
 		}
+		return Qnil;
+	    }
 
-	    default:
-		{
-		    VALUE rb_start = 0, rb_end = 0;
-		    int exclude_end = false;
-		    if (rb_range_values(index, &rb_start, &rb_end,
-				&exclude_end)) {
-			long start = NUM2LONG(rb_start);
-			long end = NUM2LONG(rb_end);
-			if (exclude_end) {
-			    --end;
-			}
-			ret = str_get_characters(RSTR(self), start, end, true);
+	default:
+	    {
+		VALUE rb_start = 0, rb_end = 0;
+		int exclude_end = false;
+		if (rb_range_values(indx, &rb_start, &rb_end,
+			    &exclude_end)) {
+		    long start = NUM2LONG(rb_start);
+		    long end = NUM2LONG(rb_end);
+		    if (exclude_end) {
+			--end;
 		    }
-		    else {
-			ret = str_get_character_at(RSTR(self), NUM2LONG(index),
-				true);
+		    return str_substr(str, start, end - start + 1);
+		}
+		else {
+		    str = str_substr(str, NUM2LONG(indx), 1);
+		    if (!NIL_P(str) && str_length(RSTR(str), true) == 0) {
+			return Qnil;
 		    }
+		    return str;
 		}
-		break;
-	}
+	    }
     }
-    else if (argc == 2) {
-	long length = NUM2LONG(argv[1]);
-	long start = NUM2LONG(argv[0]);
-	if (length < 0) {
-	    return Qnil;
-	}
-	long end = start + length - 1;
-	if ((start < 0) && (end >= 0)) {
-	    end = -1;
-	}
-	ret = str_get_characters(RSTR(self), start, end, true);
-    }
-    else {
-	rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
-    }
-
-    if (ret == NULL) {
-	return Qnil;
-    }
-    else {
-	return (VALUE)ret;
-    }
 }
 
+/*
+ *  call-seq:
+ *     str.index(substring [, offset])   => fixnum or nil
+ *     str.index(fixnum [, offset])      => fixnum or nil
+ *     str.index(regexp [, offset])      => fixnum or nil
+ *  
+ *  Returns the index of the first occurrence of the given <i>substring</i>,
+ *  character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. Returns
+ *  <code>nil</code> if not found. If the second parameter is present, it
+ *  specifies the position in the string to begin the search.
+ *     
+ *     "hello".index('e')             #=> 1
+ *     "hello".index('lo')            #=> 3
+ *     "hello".index('a')             #=> nil
+ *     "hello".index(?e)              #=> 1
+ *     "hello".index(101)             #=> 1
+ *     "hello".index(/[aeiou]/, -3)   #=> 4
+ */
+
 static VALUE
-mr_str_index(VALUE self, SEL sel, int argc, VALUE *argv)
+rstr_index(VALUE self, SEL sel, int argc, VALUE *argv)
 {
-    if ((argc < 1) || (argc > 2)) {
-	rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
+    VALUE sub, initpos;
+    long pos;
+
+    if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
+	pos = NUM2LONG(initpos);
     }
+    else {
+	pos = 0;
+    }
 
-    VALUE rb_searched = argv[0];
-    if (TYPE(rb_searched) == T_REGEXP) {
-	abort(); // TODO
+    if (pos < 0) {
+	pos += str_length(RSTR(self), true);
+	if (pos < 0) {
+	    if (TYPE(sub) == T_REGEXP) {
+		rb_backref_set(Qnil);
+	    }
+	    return Qnil;
+	}
     }
 
-    long start_index = 0;
-    if (argc == 2) {
-	start_index = NUM2LONG(argv[1]);
+    switch (TYPE(sub)) {
+	case T_REGEXP:
+	    // TODO
+	    //pos = rb_reg_adjust_startpos(sub, str, pos, 0);
+	    //pos = rb_reg_search(sub, str, pos, 0);
+	    //pos = rb_str_sublen(str, pos);
+	    break;
+
+	default: 
+	    {
+		VALUE tmp = rb_check_string_type(sub);
+		if (NIL_P(tmp)) {
+		    rb_raise(rb_eTypeError, "type mismatch: %s given",
+			    rb_obj_classname(sub));
+		}
+		sub = tmp;
+	    }
+	    /* fall through */
+	case T_STRING:
+	    {
+		rb_str_t *substr = str_need_string(sub);
+		pos = str_index_for_string(RSTR(self), substr, pos, true);
+	    }
+	    break;
     }
-    rb_str_t *searched = str_need_string(rb_searched);
 
-    long index = str_index_for_string(RSTR(self), searched, start_index, true);
-    if (index == -1) {
+    if (pos == -1) {
 	return Qnil;
     }
-    else {
-	return INT2NUM(index);
-    }
+    return LONG2NUM(pos);
 }
 
 static VALUE
-mr_str_getchar(VALUE self, SEL sel, VALUE index)
+rstr_getchar(VALUE self, SEL sel, VALUE index)
 {
-    rb_str_t *ret = str_get_character_at(RSTR(self), FIX2LONG(index), false);
-    if (ret == NULL) {
-	return Qnil;
-    }
-    else {
-	return (VALUE)ret;
-    }
+    const long idx = FIX2LONG(index);
+    return str_substr(self, idx, 1);
 }
 
+/*
+ *  call-seq:
+ *     str + other_str   => new_str
+ *  
+ *  Concatenation---Returns a new <code>String</code> containing
+ *  <i>other_str</i> concatenated to <i>str</i>.
+ *     
+ *     "Hello from " + self.to_s   #=> "Hello from main"
+ */
+
 static VALUE
-mr_str_plus(VALUE self, SEL sel, VALUE to_add)
+rstr_plus(VALUE self, SEL sel, VALUE other)
 {
-    return (VALUE)str_plus_string(RSTR(self), str_need_string(to_add));
+    rb_str_t *newstr = str_dup(RSTR(self));
+    str_concat_string(newstr, str_need_string(other));
+    return (VALUE)newstr;
 }
 
+/*
+ *  call-seq:
+ *     str << fixnum        => str
+ *     str.concat(fixnum)   => str
+ *     str << obj           => str
+ *     str.concat(obj)      => str
+ *  
+ *  Append---Concatenates the given object to <i>str</i>. If the object is a
+ *  <code>Fixnum</code>, it is considered as a codepoint, and is converted
+ *  to a character before concatenation.
+ *     
+ *     a = "hello "
+ *     a << "world"   #=> "hello world"
+ *     a.concat(33)   #=> "hello world!"
+ */
+
 static VALUE
-mr_str_concat(VALUE self, SEL sel, VALUE to_concat)
+rstr_concat(VALUE self, SEL sel, VALUE other)
 {
-    switch (TYPE(to_concat)) {
+    rstr_modify(self);
+    switch (TYPE(other)) {
 	case T_FIXNUM:
-	case T_BIGNUM:
 	    abort(); // TODO
 
 	default:
-	    str_concat_string(RSTR(self), str_need_string(to_concat));
+	    str_concat_string(RSTR(self), str_need_string(other));
     }
     return self;
 }
 
+/*
+ *  call-seq:
+ *     str == obj   => true or false
+ *  
+ *  Equality---If <i>obj</i> is not a <code>String</code>, returns
+ *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
+ *  <code><=></code> <i>obj</i> returns zero.
+ */
+
 static VALUE
-mr_str_equal(VALUE self, SEL sel, VALUE compared_to)
+rstr_equal(VALUE self, SEL sel, VALUE other)
 {
-    if (SPECIAL_CONST_P(compared_to)) {
-	return Qfalse;
+    if (self == other) {
+	return Qtrue;
     }
 
-    if (TYPE(compared_to) == T_STRING) {
-	rb_str_t *str;
-	if (IS_RSTR(compared_to)) {
-	    str = RSTR(compared_to);
+    if (TYPE(other) != T_STRING) {
+	if (!rb_respond_to(other, rb_intern("to_str"))) {
+	    return Qfalse;
 	}
-	else {
-	    str = str_new_from_cfstring((CFStringRef)compared_to);
-	}
-	return str_is_equal_to_string(RSTR(self), str) ? Qtrue : Qfalse;
+	return rb_equal(other, self);
     }
+
+    rb_str_t *str;
+    if (IS_RSTR(other)) {
+	str = RSTR(other);
+    }
     else {
-	return Qfalse;
+	str = str_new_from_cfstring((CFStringRef)other);
     }
+    return str_is_equal_to_string(RSTR(self), str) ? Qtrue : Qfalse;
 }
 
+/*
+ *  call-seq:
+ *     str.include? other_str   => true or false
+ *     str.include? fixnum      => true or false
+ *  
+ *  Returns <code>true</code> if <i>str</i> contains the given string or
+ *  character.
+ *     
+ *     "hello".include? "lo"   #=> true
+ *     "hello".include? "ol"   #=> false
+ *     "hello".include? ?h     #=> true
+ */
+
 static VALUE
-mr_str_include(VALUE self, SEL sel, VALUE searched)
+rstr_includes(VALUE self, SEL sel, VALUE searched)
 {
     return str_include_string(RSTR(self), str_need_string(searched))
 	? Qtrue : Qfalse;
 }
 
 static VALUE
-mr_str_is_stored_in_uchars(VALUE self, SEL sel)
+rstr_is_stored_in_uchars(VALUE self, SEL sel)
 {
     return str_is_stored_in_uchars(RSTR(self)) ? Qtrue : Qfalse;
 }
 
+/*
+ *  call-seq:
+ *     str.to_s     => str
+ *     str.to_str   => str
+ *  
+ *  Returns the receiver.
+ */
+
 static VALUE
-mr_str_to_s(VALUE self, SEL sel)
+rstr_to_s(VALUE self, SEL sel)
 {
     if (CLASS_OF(self) != rb_cRubyString) {
-	return (VALUE)str_dup(self);
+	VALUE dup = (VALUE)str_dup(RSTR(self));
+	if (OBJ_TAINTED(self)) {
+	    OBJ_TAINT(dup);
+	}
+	return dup;
     }
     return self;
 }
 
+/*
+ *  call-seq:
+ *     str.intern   => symbol
+ *     str.to_sym   => symbol
+ *  
+ *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
+ *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
+ *     
+ *     "Koala".intern         #=> :Koala
+ *     s = 'cat'.to_sym       #=> :cat
+ *     s == :cat              #=> true
+ *     s = '@cat'.to_sym      #=> :@cat
+ *     s == :@cat             #=> true
+ *
+ *  This can also be used to create symbols that cannot be represented using the
+ *  <code>:xxx</code> notation.
+ *     
+ *     'cat and dog'.to_sym   #=> :"cat and dog"
+ */
+
 static VALUE
-mr_str_intern(VALUE self, SEL sel)
+rstr_intern(VALUE self, SEL sel)
 {
     if (OBJ_TAINTED(self) && rb_safe_level() >= 1) {
 	rb_raise(rb_eSecurityError, "Insecure: can't intern tainted string");
@@ -1474,6 +1611,128 @@
     return ID2SYM(rb_intern(RSTR(self)->data.bytes));
 }
 
+/*
+ * call-seq:
+ *   str.inspect   => string
+ *
+ * Returns a printable version of _str_, surrounded by quote marks,
+ * with special characters escaped.
+ *
+ *    str = "hello"
+ *    str[3] = "\b"
+ *    str.inspect       #=> "\"hel\\bo\""
+ */
+
+static void
+inspect_append(VALUE result, UChar c, bool escape)
+{
+    if (escape) {
+	str_append_uchar(RSTR(result), '\\');
+    }
+    str_append_uchar(RSTR(result), c);
+}
+
+static VALUE
+str_inspect(VALUE str, bool dump)
+{
+    const bool uchars = str_is_stored_in_uchars(RSTR(str));
+    const long len = uchars
+	? str_length(RSTR(str), false) : RSTR(str)->length_in_bytes;
+
+    if (len == 0) {
+	return rb_str_new2("\"\"");
+    }
+
+    // Allocate an UTF-8 string with a good initial capacity.
+    // Binary strings will likely have most bytes escaped.
+    const long result_init_len =
+	BINARY_ENC(RSTR(str)->encoding) ? (len * 5) + 2 : len + 2;
+    VALUE result = rb_unicode_str_new(NULL, result_init_len);
+
+#define GET_UCHAR(pos) \
+    ((uchars \
+      ? RSTR(str)->data.uchars[pos] : (UChar)RSTR(str)->data.bytes[pos]))
+
+    inspect_append(result, '"', false);
+    for (long i = 0; i < len; i++) {
+	const UChar c = GET_UCHAR(i);
+
+	if (iswprint(c)) {
+	    if (c == '"' || c == '\\') {
+		inspect_append(result, c, true);
+	    }
+	    else if (dump && c == '#' && i + 1 < len) {
+		const UChar c2 = GET_UCHAR(i + 1);
+		const bool need_escape = c2 == '$' || c2 == '@' || c2 == '{';
+		inspect_append(result, c, need_escape);
+	    }
+	    else {
+		inspect_append(result, c, false);
+	    }
+	}
+	else if (c == '\n') {
+	    inspect_append(result, 'n', true);
+	} 
+	else if (c == '\r') {
+	    inspect_append(result, 'r', true);
+	} 
+	else if (c == '\t') {
+	    inspect_append(result, 't', true);
+	} 
+	else if (c == '\f') {
+	    inspect_append(result, 'f', true);
+	}
+	else if (c == '\013') {
+	    inspect_append(result, 'v', true);
+	}
+	else if (c == '\010') {
+	    inspect_append(result, 'b', true);
+	}
+	else if (c == '\007') {
+	    inspect_append(result, 'a', true);
+	}
+	else if (c == 033) {
+	    inspect_append(result, 'e', true);
+	}
+	else {
+	    char buf[10];
+	    snprintf(buf, sizeof buf, "\\x%02X", c);
+	    char *p = buf;
+	    while (*p != '\0') {
+		inspect_append(result, *p, false);
+		p++;
+	    }
+	}
+    }
+    inspect_append(result, '"', false);
+   
+#undef GET_UCHAR
+
+    return result; 
+}
+
+static VALUE
+rstr_inspect(VALUE self, SEL sel)
+{
+    return str_inspect(self, false);
+}
+
+/*
+ *  call-seq:
+ *     str.dump   => new_str
+ *  
+ *  Produces a version of <i>str</i> with all nonprinting characters replaced by
+ *  <code>\nnn</code> notation and all special characters escaped.
+ */
+
+static VALUE
+rstr_dump(VALUE self, SEL sel)
+{
+    return str_inspect(self, true);
+}
+
+// NSString primitives.
+
 static CFIndex
 rstr_imp_length(void *rcv, SEL sel)
 {
@@ -1506,45 +1765,46 @@
     rb_set_class_path(rb_cRubyString, rb_cObject, "String");
     rb_const_set(rb_cObject, rb_intern("String"), rb_cRubyString);
 
-    rb_objc_define_method(*(VALUE *)rb_cRubyString, "alloc",
-	mr_str_s_alloc, 0);
-    rb_objc_define_method(rb_cRubyString, "initialize", mr_str_initialize, -1);
-    rb_objc_define_method(rb_cRubyString, "initialize_copy", mr_str_replace, 1);
-    rb_objc_define_method(rb_cRubyString, "replace", mr_str_replace, 1);
-    rb_objc_define_method(rb_cRubyString, "clear", mr_str_clear, 0);
-    rb_objc_define_method(rb_cRubyString, "encoding", mr_str_encoding, 0);
-    rb_objc_define_method(rb_cRubyString, "length", mr_str_length, 0);
-    rb_objc_define_method(rb_cRubyString, "size", mr_str_length, 0); // alias
-    rb_objc_define_method(rb_cRubyString, "bytesize", mr_str_bytesize, 0);
-    rb_objc_define_method(rb_cRubyString, "getbyte", mr_str_getbyte, 1);
-    rb_objc_define_method(rb_cRubyString, "setbyte", mr_str_setbyte, 2);
+    rb_objc_define_method(*(VALUE *)rb_cRubyString, "alloc", rstr_alloc, 0);
+    rb_objc_define_method(rb_cRubyString, "initialize", rstr_initialize, -1);
+    rb_objc_define_method(rb_cRubyString, "initialize_copy", rstr_replace, 1);
+    rb_objc_define_method(rb_cRubyString, "dup", rstr_dup, 0);
+    rb_objc_define_method(rb_cRubyString, "clone", rstr_clone, 0);
+    rb_objc_define_method(rb_cRubyString, "replace", rstr_replace, 1);
+    rb_objc_define_method(rb_cRubyString, "clear", rstr_clear, 0);
+    rb_objc_define_method(rb_cRubyString, "encoding", rstr_encoding, 0);
+    rb_objc_define_method(rb_cRubyString, "length", rstr_length, 0);
+    rb_objc_define_method(rb_cRubyString, "size", rstr_length, 0); // alias
+    rb_objc_define_method(rb_cRubyString, "bytesize", rstr_bytesize, 0);
+    rb_objc_define_method(rb_cRubyString, "getbyte", rstr_getbyte, 1);
+    rb_objc_define_method(rb_cRubyString, "setbyte", rstr_setbyte, 2);
     rb_objc_define_method(rb_cRubyString, "force_encoding",
-	    mr_str_force_encoding, 1);
+	    rstr_force_encoding, 1);
     rb_objc_define_method(rb_cRubyString, "valid_encoding?",
-	    mr_str_is_valid_encoding, 0);
-    rb_objc_define_method(rb_cRubyString, "ascii_only?",
-	    mr_str_is_ascii_only, 0);
-    rb_objc_define_method(rb_cRubyString, "[]", mr_str_aref, -1);
-    rb_objc_define_method(rb_cRubyString, "index", mr_str_index, -1);
-    rb_objc_define_method(rb_cRubyString, "+", mr_str_plus, 1);
-    rb_objc_define_method(rb_cRubyString, "<<", mr_str_concat, 1);
-    rb_objc_define_method(rb_cRubyString, "concat", mr_str_concat, 1);
-    rb_objc_define_method(rb_cRubyString, "==", mr_str_equal, 1);
-    rb_objc_define_method(rb_cRubyString, "include?", mr_str_include, 1);
-    rb_objc_define_method(rb_cRubyString, "to_s", mr_str_to_s, 0);
-    rb_objc_define_method(rb_cRubyString, "to_str", mr_str_to_s, 0);
-    rb_objc_define_method(rb_cRubyString, "to_sym", mr_str_intern, 0);
-    rb_objc_define_method(rb_cRubyString, "intern", mr_str_intern, 0);
+	    rstr_is_valid_encoding, 0);
+    rb_objc_define_method(rb_cRubyString, "ascii_only?", rstr_is_ascii_only, 0);
+    rb_objc_define_method(rb_cRubyString, "[]", rstr_aref, -1);
+    rb_objc_define_method(rb_cRubyString, "slice", rstr_aref, -1);
+    rb_objc_define_method(rb_cRubyString, "index", rstr_index, -1);
+    rb_objc_define_method(rb_cRubyString, "+", rstr_plus, 1);
+    rb_objc_define_method(rb_cRubyString, "<<", rstr_concat, 1);
+    rb_objc_define_method(rb_cRubyString, "concat", rstr_concat, 1);
+    rb_objc_define_method(rb_cRubyString, "==", rstr_equal, 1);
+    rb_objc_define_method(rb_cRubyString, "include?", rstr_includes, 1);
+    rb_objc_define_method(rb_cRubyString, "to_s", rstr_to_s, 0);
+    rb_objc_define_method(rb_cRubyString, "to_str", rstr_to_s, 0);
+    rb_objc_define_method(rb_cRubyString, "to_sym", rstr_intern, 0);
+    rb_objc_define_method(rb_cRubyString, "intern", rstr_intern, 0);
+    rb_objc_define_method(rb_cRubyString, "inspect", rstr_inspect, 0);
+    rb_objc_define_method(rb_cRubyString, "dump", rstr_dump, 0);
 
-    // added for MacRuby
-    rb_objc_define_method(rb_cRubyString, "chars_count", mr_str_chars_count, 0);
-    rb_objc_define_method(rb_cRubyString, "getchar", mr_str_getchar, 1);
+    // Added for MacRuby (debugging).
+    rb_objc_define_method(rb_cRubyString, "__chars_count__",
+	    rstr_chars_count, 0);
+    rb_objc_define_method(rb_cRubyString, "__getchar__", rstr_getchar, 1);
+    rb_objc_define_method(rb_cRubyString, "__stored_in_uchars?__",
+	    rstr_is_stored_in_uchars, 0);
 
-    // this method does not exist in Ruby and is there only for debugging
-    // purpose
-    rb_objc_define_method(rb_cRubyString, "stored_in_uchars?",
-	    mr_str_is_stored_in_uchars, 0);
-
     // Cocoa primitives.
     rb_objc_install_method2((Class)rb_cRubyString, "length",
 	    (IMP)rstr_imp_length);
@@ -1659,9 +1919,9 @@
 VALUE
 rb_unicode_str_new(const UniChar *ptr, const size_t len)
 {
-    rb_str_t *str = str_alloc(rb_cRubyString);
-    str_replace_with_unichars(str, ptr, len);
-    return (VALUE)str;
+    VALUE str = rb_str_new_empty();
+    str_replace_with_uchars(RSTR(str), ptr, len);
+    return str;
 }
 
 VALUE
@@ -1680,7 +1940,7 @@
 		    // fall through
 
 		case T_STRING:
-		    mr_str_concat(str, 0, fragment);
+		    rstr_concat(str, 0, fragment);
 		    break;
 	    }
 	}
@@ -1698,7 +1958,7 @@
     // XXX should we assert that enc is single byte?
     if (enc == NULL) {
 	// This function can be called with a NULL encoding. 
-	enc = rb_encodings[ENCODING_BINARY];
+	enc = rb_encodings[ENCODING_UTF8];
     }
     rb_str_t *str = str_alloc(rb_cRubyString);
     str_replace_with_bytes(str, cstr, len, enc);
@@ -1708,7 +1968,7 @@
 VALUE
 rb_str_new(const char *cstr, long len)
 {
-    return rb_enc_str_new(cstr, len, rb_encodings[ENCODING_BINARY]);
+    return rb_enc_str_new(cstr, len, rb_encodings[ENCODING_UTF8]);
 }
 
 VALUE
@@ -1879,7 +2139,7 @@
 	    name = tmp;
 	    /* fall through */
 	case T_STRING:
-	    name = mr_str_intern(name, 0);
+	    name = rstr_intern(name, 0);
 	    /* fall through */
 	case T_SYMBOL:
 	    return SYM2ID(name);
@@ -1924,7 +2184,7 @@
 VALUE
 rb_str_buf_cat(VALUE str, const char *cstr, long len)
 {
-    return rb_enc_str_buf_cat(str, cstr, len, rb_encodings[ENCODING_BINARY]);
+    return rb_enc_str_buf_cat(str, cstr, len, RSTR(str)->encoding);
 }
 
 VALUE
@@ -1955,7 +2215,7 @@
 rb_str_buf_append(VALUE str, VALUE str2)
 {
     if (IS_RSTR(str)) {
-	return mr_str_concat(str, 0, str2);
+	return rstr_concat(str, 0, str2);
     }
     CFStringAppend((CFMutableStringRef)str, (CFStringRef)str2);
     return str;
@@ -2002,7 +2262,7 @@
 rb_str_equal(VALUE str, VALUE str2)
 {
     if (IS_RSTR(str)) {
-	return mr_str_equal(str, 0, str2);
+	return rstr_equal(str, 0, str2);
     }
     return CFEqual((CFStringRef)str, (CFStringRef)str2) ? Qtrue : Qfalse;
 }
@@ -2011,7 +2271,7 @@
 rb_str_dup(VALUE str)
 {
     if (IS_RSTR(str)) {
-	return (VALUE)str_dup(str);
+	return (VALUE)str_dup(RSTR(str));
     }
     if (TYPE(str) == T_SYMBOL) {
 	return rb_str_new2(RSYMBOL(str)->str);
@@ -2031,6 +2291,9 @@
 VALUE
 rb_str_inspect(VALUE rcv)
 {
+    if (RSTR(rcv)) {
+	return rstr_inspect(rcv, 0);
+    }
     // TODO
     return rcv;
 }

Modified: MacRuby/branches/icu/ucnv.c
===================================================================
--- MacRuby/branches/icu/ucnv.c	2010-02-19 23:08:37 UTC (rev 3580)
+++ MacRuby/branches/icu/ucnv.c	2010-02-20 04:23:17 UTC (rev 3581)
@@ -9,6 +9,7 @@
  * Copyright (C) 2000 Information-technology Promotion Agency, Japan
  */
 
+#include "ruby.h"
 #include "encoding.h"
 #include "unicode/ucnv.h"
 
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20100219/27237c2d/attachment-0001.html>