[macruby-changes] [5049] MacRuby/trunk

Fri Dec 17 18:39:46 PST 2010

Revision: 5049
          http://trac.macosforge.org/projects/ruby/changeset/5049
Author:   vincent.isambart at gmail.com
Date:     2010-12-17 18:39:42 -0800 (Fri, 17 Dec 2010)
Log Message:
-----------
Started using C-blocks (note that they should not be used in
ObjC or sensitive places like dispatcher.cpp)

All this was to start cleaning up the string code.
I removed the per-encoding function pointers, as I'm pretty sure no one
will use them to extend the encoding handling and they make the code
harder to maintain.

Feature-wise currently the only change is that String#inspect is much
better when part of a string is invalid:
% ./miniruby -e 'p "あ\xFF"'
"あ\xFF"

Modified Paths:
--------------
    MacRuby/trunk/encoding.c
    MacRuby/trunk/encoding.h
    MacRuby/trunk/rakelib/builder/options.rb
    MacRuby/trunk/string.c
    MacRuby/trunk/ucnv.c
    MacRuby/trunk/vm.cpp

Added Paths:
-----------
    MacRuby/trunk/encoding_ucnv.h

Modified: MacRuby/trunk/encoding.c
===================================================================

--- MacRuby/trunk/encoding.c	2010-12-18 00:14:02 UTC (rev 5048)
+++ MacRuby/trunk/encoding.c	2010-12-18 02:39:42 UTC (rev 5049)
@@ -22,16 +22,6 @@
 static rb_encoding_t *default_external = NULL;
 rb_encoding_t *rb_encodings[ENCODINGS_COUNT];
 
-static void str_undefined_update_flags(rb_str_t *self) { abort(); }
-static void str_undefined_make_data_binary(rb_str_t *self) { abort(); }
-static bool str_undefined_try_making_data_uchars(rb_str_t *self) { abort(); }
-static long str_undefined_length(rb_str_t *self, bool ucs2_mode) { abort(); }
-static long str_undefined_bytesize(rb_str_t *self) { abort(); }
-static character_boundaries_t str_undefined_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode) { abort(); }
-static long str_undefined_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes, bool ucs2_mode) { abort(); }
-static void str_undefined_transcode_to_utf16(struct rb_encoding *src_enc, rb_str_t *self, long *pos, UChar **utf16, long *utf16_length) { abort(); }
-static void str_undefined_transcode_from_utf16(struct rb_encoding *dst_enc, UChar *utf16, long utf16_length, long *pos, char **bytes, long *bytes_length) { abort(); }
-
 static VALUE
 mr_enc_s_list(VALUE klass, SEL sel)
 {
@@ -273,22 +263,6 @@
     encoding->aliases_count = aliases_count;
     encoding->aliases = aliases;
 
-    // fill the default implementations with aborts
-    encoding->methods.update_flags = str_undefined_update_flags;
-    encoding->methods.make_data_binary = str_undefined_make_data_binary;
-    encoding->methods.try_making_data_uchars =
-	str_undefined_try_making_data_uchars;
-    encoding->methods.length = str_undefined_length;
-    encoding->methods.bytesize = str_undefined_bytesize;
-    encoding->methods.get_character_boundaries =
-	str_undefined_get_character_boundaries;
-    encoding->methods.offset_in_bytes_to_index =
-	str_undefined_offset_in_bytes_to_index;
-    encoding->methods.transcode_to_utf16 =
-	str_undefined_transcode_to_utf16;
-    encoding->methods.transcode_from_utf16 =
-	str_undefined_transcode_from_utf16;
-
     switch (rb_encoding_type) {
 	case ENCODING_TYPE_SPECIAL:
 	    break;

Modified: MacRuby/trunk/encoding.h
===================================================================
--- MacRuby/trunk/encoding.h	2010-12-18 00:14:02 UTC (rev 5048)
+++ MacRuby/trunk/encoding.h	2010-12-18 02:39:42 UTC (rev 5049)
@@ -110,18 +110,6 @@
     long end_offset_in_bytes;
 } character_boundaries_t;
 
-typedef struct {
-    void (*update_flags)(rb_str_t *);
-    void (*make_data_binary)(rb_str_t *);
-    bool (*try_making_data_uchars)(rb_str_t *);
-    long (*length)(rb_str_t *, bool);
-    long (*bytesize)(rb_str_t *);
-    character_boundaries_t (*get_character_boundaries)(rb_str_t *, long, bool);
-    long (*offset_in_bytes_to_index)(rb_str_t *, long, bool);
-    void (*transcode_to_utf16)(struct rb_encoding *, rb_str_t *, long *, UChar **, long *);
-    void (*transcode_from_utf16)(struct rb_encoding *, UChar *, long, long *, char **, long *);
-} encoding_methods_t;
-
 typedef struct rb_encoding {
     struct RBasic basic;
     unsigned int index;
@@ -131,7 +119,6 @@
     unsigned char min_char_size;
     bool single_byte_encoding : 1;
     bool ascii_compatible : 1;
-    encoding_methods_t methods;
     void *private_data;
 } rb_encoding_t;
 

Added: MacRuby/trunk/encoding_ucnv.h
===================================================================
--- MacRuby/trunk/encoding_ucnv.h	                        (rev 0)
+++ MacRuby/trunk/encoding_ucnv.h	2010-12-18 02:39:42 UTC (rev 5049)
@@ -0,0 +1,38 @@
+/* 
+ * MacRuby implementation of Ruby 1.9 String.
+ * Declarations of the converter-based string helpers defined in ucnv.c.
+ * This file is covered by the Ruby license. See COPYING for more details.
+ * 
+ * Copyright (C) 2007-2010, Apple Inc. All rights reserved.
+ * Copyright (C) 1993-2007 Yukihiro Matsumoto
+ * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
+ * Copyright (C) 2000 Information-technology Promotion Agency, Japan
+ */
+
+#ifndef __UCNV_H_
+#define __UCNV_H_
+
+#include "encoding.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+// Called once per character; setting *stop to true ends the iteration early.
+typedef void (^each_char_callback_t)(UChar32 c, const char* character_start, long character_length, bool *stop);
+
+void str_ucnv_update_flags(rb_str_t *self);
+void str_ucnv_make_data_binary(rb_str_t *self);
+bool str_ucnv_try_making_data_uchars(rb_str_t *self);
+long str_ucnv_length(rb_str_t *self, bool ucs2_mode);
+long str_ucnv_bytesize(rb_str_t *self);
+character_boundaries_t str_ucnv_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode);
+long str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes, bool ucs2_mode);
+void str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc, rb_str_t *self, long *pos, UChar **utf16, long *utf16_length);
+void str_ucnv_transcode_from_utf16(struct rb_encoding *dst_enc, UChar *utf16, long utf16_length, long *utf16_pos, char **bytes, long *bytes_length);
+void str_ucnv_each_char(rb_str_t *self, each_char_callback_t callback);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+#endif /* __UCNV_H_ */

Modified: MacRuby/trunk/rakelib/builder/options.rb
===================================================================
--- MacRuby/trunk/rakelib/builder/options.rb	2010-12-18 00:14:02 UTC (rev 5048)
+++ MacRuby/trunk/rakelib/builder/options.rb	2010-12-18 02:39:42 UTC (rev 5049)
@@ -152,8 +152,8 @@
     sdk = opt.delete(:sdk)
     has_libauto = sdk ? File.exist?("#{sdk}/usr/lib/libauto.dylib") : true
     archflags = archs.map { |x| "-arch #{x}" }.join(' ')
-    @cflags = "-std=c99 -I. -I./include -fno-common -pipe -g -Wall -fexceptions -O#{OPTZ_LEVEL} -Wno-deprecated-declarations -Werror #{archflags}"
-    @cxxflags = "-I. -I./include -g -Wall -Wno-deprecated-declarations -Werror #{archflags}"
+    @cflags = "-std=c99 -I. -I./include -pipe -fno-common -fexceptions -fblocks -g -O#{OPTZ_LEVEL} -Wall -Wno-deprecated-declarations -Werror #{archflags}"
+    @cxxflags = "-I. -I./include -fblocks -g -Wall -Wno-deprecated-declarations -Werror #{archflags}"
     @ldflags = '-lpthread -ldl -lxml2 -lobjc -licucore -framework Foundation'
     @ldflags << " -lauto" if has_libauto
     if opt.delete(:static)

Modified: MacRuby/trunk/string.c
===================================================================
--- MacRuby/trunk/string.c	2010-12-18 00:14:02 UTC (rev 5048)
+++ MacRuby/trunk/string.c	2010-12-18 02:39:42 UTC (rev 5049)
@@ -23,6 +23,7 @@
 #include "ruby/node.h"
 #include "vm.h"
 #include "class.h"
+#include "encoding_ucnv.h"
 
 #include <unicode/unum.h>
 #include <unicode/utrans.h>
@@ -138,7 +139,7 @@
 	str_update_flags_utf16(self);
     }
     else {
-	self->encoding->methods.update_flags(self);
+	str_ucnv_update_flags(self);
     }
 }
 
@@ -387,7 +388,7 @@
 	return;
     }
 
-    self->encoding->methods.make_data_binary(self);
+    str_ucnv_make_data_binary(self);
 }
 
 static bool
@@ -418,7 +419,7 @@
 	return false;
     }
 
-    return self->encoding->methods.try_making_data_uchars(self);
+    return str_ucnv_try_making_data_uchars(self);
 }
 
 static void
@@ -469,11 +470,52 @@
 	    return div_round_up(self->length_in_bytes, 2);
 	}
 	else {
-	    return self->encoding->methods.length(self, ucs2_mode);
+	    return str_ucnv_length(self, ucs2_mode);
 	}
     }
 }
 
+// Calls `callback` once per character of `self` with the code point, a
+// pointer to the character's first byte and its length in bytes. In the
+// binary/ASCII branch a byte above 127 is reported as U_SENTINEL.
+// Iteration ends early as soon as the callback sets *stop to true.
+static void
+str_each_char(rb_str_t *self, each_char_callback_t callback)
+{
+    bool stop = false;
+    if (str_is_stored_in_uchars(self)) {
+	long length = BYTES_TO_UCHARS(self->length_in_bytes);
+	for (long i = 0; i < length;) {
+	    UChar32 c;
+	    long old_i = i;
+	    U16_NEXT(self->data.uchars, i, length, c);
+	    // U16_NEXT has advanced i past the character, so its length in
+	    // UChars is i - old_i (old_i - i would be negative).
+	    callback(c, (const char *)&self->data.uchars[old_i],
+		    UCHARS_TO_BYTES(i - old_i), &stop);
+	    if (stop) {
+		return;
+	    }
+	}
+    }
+    else if (BINARY_ENC(self->encoding)
+	    || (self->encoding == rb_encodings[ENCODING_ASCII])) {
+	const uint8_t *pos = (uint8_t*)self->data.bytes;
+	const uint8_t *end = pos + self->length_in_bytes;
+	for (; pos < end; ++pos) {
+	    // One byte per character; only 7-bit values map to a code point.
+	    UChar32 c = (*pos > 127) ? U_SENTINEL : *pos;
+	    callback(c, (const char *)pos, 1, &stop);
+	    if (stop) {
+		return;
+	    }
+	}
+    }
+    else {
+	str_ucnv_each_char(self, callback);
+    }
+}
+
 static UChar
 str_get_uchar(rb_str_t *self, long pos, bool ucs2_mode)
 {
@@ -494,7 +536,7 @@
 	    return self->length_in_bytes;
 	}
 	else {
-	    return self->encoding->methods.bytesize(self);
+	    return str_ucnv_bytesize(self);
 	}
     }
     else {
@@ -654,7 +696,7 @@
 		+ 2;
 	}
 	else {
-	    boundaries = self->encoding->methods.get_character_boundaries(self,
+	    boundaries = str_ucnv_get_character_boundaries(self,
 		    index, ucs2_mode);
 	}
     }
@@ -1032,7 +1074,7 @@
 	    return BYTES_TO_UCHARS(offset_in_bytes);
 	}
 	else {
-	    return self->encoding->methods.offset_in_bytes_to_index(self,
+	    return str_ucnv_offset_in_bytes_to_index(self,
 		    offset_in_bytes, ucs2_mode);
 	}
     }
@@ -1362,7 +1404,7 @@
 	    pos_in_src = self->length_in_bytes;
 	}
 	else {
-	    src_encoding_used->methods.transcode_to_utf16(src_encoding_used,
+	    str_ucnv_transcode_to_utf16(src_encoding_used,
 		    self, &pos_in_src, &utf16, &utf16_length);
 	}
 
@@ -1441,7 +1483,7 @@
 	    for (;;) {
 		long bytes_length;
 		char *bytes;
-		dst_encoding_used->methods.transcode_from_utf16(dst_encoding_used,
+		str_ucnv_transcode_from_utf16(dst_encoding_used,
 			utf16, utf16_length, &utf16_pos, &bytes, &bytes_length);
 		if (bytes_length > 0) {
 		    str_concat_bytes(dst_str, bytes, bytes_length);
@@ -2785,7 +2827,8 @@
     VALUE result;
     if (len == 0) {
 	result = rb_str_new2("\"\"");
-	goto bail;
+	OBJ_INFECT(result, str);
+	return result;
     }
 
     // Allocate an UTF-8 string with a good initial capacity.
@@ -2794,31 +2837,18 @@
 	BINARY_ENC(str->encoding) ? (len * 5) + 2 : len + 2;
     result = rb_unicode_str_new(NULL, result_init_len);
 
-#define GET_UCHAR(pos) \
-    ((uchars \
-      ? str->data.uchars[pos] : (unsigned char)str->data.bytes[pos]))
-
     inspect_append(result, '"', false);
-    for (long i = 0; i < len; i++) {
-	const UChar c = GET_UCHAR(i);
-
-	bool print;
-	if (uchars) {
-	    print = iswprint(c);
+    __block UChar32 prev = 0;
+    str_each_char(str, ^(UChar32 c, const char* char_start, long char_len, bool *stop) {
+	bool print = iswprint(c);
+	if (dump && prev == '#') {
+	    inspect_append(result, prev, (c == '$' || c == '@' || c == '{'));
 	}
-	else { // ASCII printable characters
-	    print = ((c >= 0x20) && (c <= 0x7E));
-	}
 	if (print) {
 	    if (c == '"' || c == '\\') {
 		inspect_append(result, c, true);
 	    }
-	    else if (dump && c == '#' && i + 1 < len) {
-		const UChar c2 = GET_UCHAR(i + 1);
-		const bool need_escape = c2 == '$' || c2 == '@' || c2 == '{';
-		inspect_append(result, c, need_escape);
-	    }
-	    else {
+	    else if (c != '#' || !dump) {
 		inspect_append(result, c, false);
 	    }
 	}
@@ -2848,19 +2878,23 @@
 	}
 	else {
 	    char buf[10];
-	    snprintf(buf, sizeof buf, "\\x%02X", c);
-	    char *p = buf;
-	    while (*p != '\0') {
-		inspect_append(result, *p, false);
-		p++;
+	    for (long i = 0; i < char_len; ++i) {
+		uint8_t byte = (uint8_t)char_start[i];
+		snprintf(buf, sizeof buf, "\\x%02X", byte);
+		char *p = buf;
+		while (*p != '\0') {
+		    inspect_append(result, *p, false);
+		    p++;
+		}
 	    }
 	}
+	prev = c;
+    });
+    if (dump && prev == '#') {
+	inspect_append(result, prev, false);
     }
     inspect_append(result, '"', false);
    
-#undef GET_UCHAR
-
-bail:
     OBJ_INFECT(result, str);
     return result; 
 }

Modified: MacRuby/trunk/ucnv.c
===================================================================
--- MacRuby/trunk/ucnv.c	2010-12-18 00:14:02 UTC (rev 5048)
+++ MacRuby/trunk/ucnv.c	2010-12-18 02:39:42 UTC (rev 5049)
@@ -10,7 +10,7 @@
  */
 
 #include "ruby/macruby.h"
-#include "encoding.h"
+#include "encoding_ucnv.h"
 #include "unicode/ucnv.h"
 
 // do not forget to close the converter
@@ -28,7 +28,7 @@
 	); \
     ucnv_reset(cnv);
 
-static void
+void
 str_ucnv_update_flags(rb_str_t *self)
 {
     assert(!str_is_stored_in_uchars(self));
@@ -73,7 +73,7 @@
     str_set_ascii_only(self, ascii_only);
 }
 
-static void
+void
 str_ucnv_make_data_binary(rb_str_t *self)
 {
     assert(str_is_stored_in_uchars(self));
@@ -129,7 +129,7 @@
     return approximation;
 }
 
-static bool
+bool
 str_ucnv_try_making_data_uchars(rb_str_t *self)
 {
     assert(!str_is_stored_in_uchars(self));
@@ -175,7 +175,7 @@
     }
 }
 
-static long
+long
 str_ucnv_length(rb_str_t *self, bool ucs2_mode)
 {
     assert(!str_is_stored_in_uchars(self));
@@ -218,8 +218,59 @@
     return len;
 }
 
+
+void rb_ensure_b(void (^b_block)(void), void (^e_block)(void));
+
+// Iterates over a string whose data is stored as bytes (not UChars), using
+// the converter of its encoding to decode one code point at a time.
+// The callback receives each decoded character with its start and byte
+// length. Bytes the converter fails on are reported as U_SENTINEL, in
+// pieces of at most min_char_size bytes. Iteration ends at the end of the
+// data or once the callback sets *stop. The converter is closed by the
+// second block passed to rb_ensure_b.
+void
+str_ucnv_each_char(rb_str_t *self, each_char_callback_t callback)
+{
+    assert(!str_is_stored_in_uchars(self));
+
+    USE_CONVERTER(cnv, self->encoding);
+
+    rb_ensure_b(^{
+	const char *const limit = self->data.bytes + self->length_in_bytes;
+	const char *cursor = self->data.bytes;
+	bool stop = false;
+	while (!stop) {
+	    const char *start = cursor;
+	    UErrorCode status = U_ZERO_ERROR;
+	    const UChar32 c = ucnv_getNextUChar(cnv, &cursor, limit, &status);
+	    if (status == U_INDEX_OUTOFBOUNDS_ERROR) {
+		// no input left
+		break;
+	    }
+	    if (!U_FAILURE(status)) {
+		// valid character spanning [start, cursor)
+		callback(c, start, cursor - start, &stop);
+		continue;
+	    }
+	    // invalid input: report the consumed bytes chunk by chunk
+	    const long max_chunk = self->encoding->min_char_size;
+	    while (!stop && start < cursor) {
+		long chunk = cursor - start;
+		if (chunk > max_chunk) {
+		    chunk = max_chunk;
+		}
+		callback(U_SENTINEL, start, chunk, &stop);
+		start += chunk;
+	    }
+	}
+    }, ^{
+	ucnv_close(cnv);
+    });
+}
+
+
 #define STACK_BUFFER_SIZE 1024
-static long
+long
 str_ucnv_bytesize(rb_str_t *self)
 {
     assert(str_is_stored_in_uchars(self));
@@ -254,7 +305,7 @@
     return len;
 }
 
-static character_boundaries_t
+character_boundaries_t
 str_ucnv_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode)
 {
     assert(!str_is_stored_in_uchars(self));
@@ -353,7 +404,7 @@
     return boundaries;
 }
 
-static long
+long
 str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes,
 	bool ucs2_mode)
 {
@@ -410,7 +461,7 @@
     return index;
 }
 
-static void
+void
 str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc,
 	rb_str_t *self, long *pos,
 	UChar **utf16, long *utf16_length)
@@ -452,7 +503,7 @@
     }
 }
 
-static void
+void
 str_ucnv_transcode_from_utf16(struct rb_encoding *dst_enc,
 	UChar *utf16, long utf16_length, long *utf16_pos,
 	char **bytes, long *bytes_length)
@@ -504,17 +555,4 @@
 
     // fill the fields not filled yet
     encoding->private_data = converter;
-    encoding->methods.update_flags = str_ucnv_update_flags;
-    encoding->methods.make_data_binary = str_ucnv_make_data_binary;
-    encoding->methods.try_making_data_uchars = str_ucnv_try_making_data_uchars;
-    encoding->methods.length = str_ucnv_length;
-    encoding->methods.bytesize = str_ucnv_bytesize;
-    encoding->methods.get_character_boundaries =
-	str_ucnv_get_character_boundaries;
-    encoding->methods.offset_in_bytes_to_index =
-	str_ucnv_offset_in_bytes_to_index;
-    encoding->methods.transcode_to_utf16 =
-	str_ucnv_transcode_to_utf16;
-    encoding->methods.transcode_from_utf16 =
-	str_ucnv_transcode_from_utf16;
 }

Modified: MacRuby/trunk/vm.cpp
===================================================================
--- MacRuby/trunk/vm.cpp	2010-12-18 00:14:02 UTC (rev 5048)
+++ MacRuby/trunk/vm.cpp	2010-12-18 02:39:42 UTC (rev 5049)
@@ -3539,6 +3539,21 @@
 
 extern "C"
 void
+rb_ensure_b(void (^b_block)(void), void (^e_block)(void))
+{
+    // Scope guard: its destructor invokes e_block when this function is
+    // left, i.e. after b_block returns or while an exception unwinds.
+    struct EnsureGuard {
+	void (^on_exit)(void);
+	explicit EnsureGuard(void (^blk)(void)) : on_exit(blk) {}
+	~EnsureGuard() { on_exit(); }
+    } guard(e_block);
+
+    b_block();
+}
+
+extern "C"
+void
 rb_vm_break(VALUE val)
 {
 #if 0
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.macosforge.org/pipermail/macruby-changes/attachments/20101217/8ea6d6dc/attachment-0001.html>