Revision: 3663 http://trac.macosforge.org/projects/ruby/changeset/3663 Author: lsansonetti@apple.com Date: 2010-03-01 23:00:25 -0800 (Mon, 01 Mar 2010) Log Message: ----------- unicode string formats (a work in progress) Modified Paths: -------------- MacRuby/branches/icu/encoding.h MacRuby/branches/icu/sprintf.c MacRuby/branches/icu/string.c Modified: MacRuby/branches/icu/encoding.h =================================================================== --- MacRuby/branches/icu/encoding.h 2010-03-02 06:50:51 UTC (rev 3662) +++ MacRuby/branches/icu/encoding.h 2010-03-02 07:00:25 UTC (rev 3663) @@ -302,6 +302,8 @@ UChar rb_str_get_uchar(VALUE str, long pos); void rb_str_append_uchar(VALUE str, UChar c); unsigned long rb_str_hash_uchars(const UChar *chars, long chars_len); +long rb_uchar_strtol(UniChar *chars, long chars_len, long pos, + long *end_offset); // Return a string object appropriate for bstr_ calls. This does nothing for // data/binary RubyStrings. Modified: MacRuby/branches/icu/sprintf.c =================================================================== --- MacRuby/branches/icu/sprintf.c 2010-03-02 06:50:51 UTC (rev 3662) +++ MacRuby/branches/icu/sprintf.c 2010-03-02 07:00:25 UTC (rev 3663) @@ -13,6 +13,7 @@ #include "ruby/ruby.h" #include "ruby/encoding.h" +#include "encoding.h" /* * call-seq: @@ -289,13 +290,10 @@ VALUE rb_enc_sprintf(rb_encoding *enc, const char *format, ...) { - VALUE result; va_list ap; - va_start(ap, format); - result = rb_enc_vsprintf(enc, format, ap); + VALUE result = rb_enc_vsprintf(enc, format, ap); va_end(ap); - return result; } @@ -308,13 +306,10 @@ VALUE rb_sprintf(const char *format, ...) { - VALUE result; va_list ap; - va_start(ap, format); - result = rb_vsprintf(format, ap); + VALUE result = rb_vsprintf(format, ap); va_end(ap); - return result; } @@ -356,42 +351,50 @@ width -= slen; do { CFStringInsert((CFMutableStringRef)arg, start, pad); - } while (--width > 0); + } + while (--width > 0); } static long -cstr_update(char **str, unsigned long start, unsigned long num, char *replace) +cstr_update(UChar **str, long *str_len, long start, long num, VALUE replace) { - unsigned long len = strlen(*str) + 1; - unsigned long replace_len = strlen(replace); + const long len = *str_len; + long replace_len = replace == 0 ? 0 : rb_str_chars_len(replace); if (start + num > len) { num = len - start; } if (replace_len >= num) { - char *new_str = (char *)xmalloc(len + replace_len - num); - memcpy(new_str, *str, len); - *str = new_str; + *str_len = len + replace_len - num; + *str = (UChar *)xrealloc(*str, + sizeof(UChar) * (len + replace_len - num)); } if (replace_len != num) { - bcopy(*str + start + num, *str + start + replace_len, len - start - - num); + bcopy(*str + start + num, *str + start + replace_len, + sizeof(UChar) * (len - start - num)); } if (replace_len > 0) { - bcopy(replace, *str + start, replace_len); + UChar *replace_chars = NULL; + bool need_free = false; + rb_str_get_uchars(replace, &replace_chars, &replace_len, &need_free); + assert(replace_len > 0); + bcopy(replace_chars, *str + start, sizeof(UChar) * replace_len); + if (need_free) { + free(replace_chars); + } } return replace_len - num; } -VALUE -get_named_arg(char *format_str, unsigned long format_len, unsigned long *i, +static VALUE +get_named_arg(UChar *format_str, long format_len, unsigned long *i, VALUE hash) { if (TYPE(hash) != T_HASH) { rb_raise(rb_eArgError, "hash required for named references"); } - char closing = format_str[(*i)++] + 2; - char *str_ptr = format_str + *i; + UChar closing = format_str[(*i)++] + 2; + UChar *str_ptr = &format_str[*i]; while (*i < format_len && format_str[*i] != closing) { (*i)++; } @@ -399,26 +402,32 @@ rb_raise(rb_eArgError, "malformed name - unmatched parenthesis"); } - format_str[*i] = '\0'; - hash = rb_hash_aref(hash, rb_name2sym(str_ptr)); - format_str[*i] = closing; - return (hash); + VALUE substr = rb_unicode_str_new(str_ptr, str_ptr - format_str); + hash = rb_hash_aref(hash, ID2SYM(rb_intern_str(substr))); + return hash; } -// XXX -// - this method uses strtol to read numbers from the format string, so -// extremely large numbers get silently truncated. this should be fixed -// - switch to a cfstring format string to allow for proper encoding support - // XXX look for arguments that are altered but not duped VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt) { bool tainted = OBJ_TAINTED(fmt); - fmt = rb_str_new3(fmt); - char *format_str = (char *)RSTRING_PTR(fmt); - unsigned long format_len = strlen(format_str); - long num; + + UChar *format_str = NULL; + long format_len = 0; + bool need_free = false; + rb_str_get_uchars(fmt, &format_str, &format_len, &need_free); + if (format_len == 0) { + goto bail; + } + UChar *tmp = (UChar *)xmalloc(format_len * sizeof(UChar)); + memcpy(tmp, format_str, format_len * sizeof(UChar)); + if (need_free) { + free(format_str); + } + format_str = tmp; + + long num, pos; int j = 0; int ref_type = 0; @@ -427,7 +436,7 @@ continue; } if (format_str[i + 1] == '%') { - cstr_update(&format_str, i, 1, (char *)""); + cstr_update(&format_str, &format_len, i, 1, 0); continue; } @@ -444,7 +453,6 @@ int base = 0; CFStringRef negative_pad = NULL; CFStringRef sharp_pad = CFSTR(""); - char *str_ptr; unsigned long start = i; while (i++ < format_len) { @@ -464,16 +472,17 @@ i--; break; } - num = strtol(format_str + i, &str_ptr, 10); - if (str_ptr == format_str + i--) { + + num = rb_uchar_strtol(format_str, format_len, i, &pos); + if (pos == i--) { SET_REF_TYPE(REL_REF); width = NUM2LONG(rb_Integer(GETNTHARG(j))); j++; } - else if (*str_ptr == '$') { + else if (format_str[pos] == '$') { SET_REF_TYPE(ABS_REF); width = NUM2LONG(rb_Integer(GETNTHARG(num - 1))); - i = str_ptr - format_str; + i = pos; } } if (width < 0) { @@ -513,9 +522,9 @@ case '7': case '8': case '9': - num = strtol(format_str + i, &str_ptr, 10); - i = str_ptr - format_str; - if (*str_ptr == '$') { + num = rb_uchar_strtol(format_str, format_len, i, &pos); + i = pos; + if (format_str[pos] == '$') { if (num == 0) { rb_raise(rb_eArgError, "invalid absolute argument"); } @@ -543,23 +552,26 @@ i--; break; } - num = strtol(format_str + i, &str_ptr, 10); - if (str_ptr == format_str + i--) { + + num = rb_uchar_strtol(format_str, format_len, + i, &pos); + if (num == i--) { SET_REF_TYPE(REL_REF); precision = NUM2LONG(rb_Integer(GETNTHARG(j))); j++; } - else if (*str_ptr == '$') { + else if (format_str[pos] == '$') { SET_REF_TYPE(ABS_REF); precision = NUM2LONG(rb_Integer(GETNTHARG( num - 1))); - i = str_ptr - format_str; + i = pos; } } } else if (isdigit(format_str[i])) { - precision = strtol(format_str + i, &str_ptr, 10); - i = str_ptr - format_str - 1; + precision = rb_uchar_strtol(format_str, format_len, + i, &pos); + i = pos - 1; } else { rb_raise(rb_eArgError, "invalid precision"); @@ -653,7 +665,7 @@ break; } - arg = rb_str_new(format_str + i, 1); + arg = rb_unicode_str_new(&format_str[i], 1); if (precision_flag) { rb_str_update(arg, 0, 0, rb_big2str(LONG2NUM(precision), 10)); @@ -677,9 +689,10 @@ } rb_str_update(arg, 0, 0, (VALUE)CFSTR("%")); - asprintf(&str_ptr, RSTRING_PTR(arg), value); - arg = rb_str_new2(str_ptr); - free(str_ptr); + char *ptr; + asprintf(&ptr, RSTRING_PTR(arg), value); + arg = rb_str_new2(ptr); + free(ptr); break; } @@ -732,8 +745,10 @@ arg = rb_big2str(num, base); if (!sign_pad && IS_NEG(num) && negative_pad != NULL) { - char neg = *RSTRING_PTR(negative_pad); - str_ptr = (char *)RSTRING_PTR(arg) + 1; + break; // TODO +#if 0 + UChar neg = CFStringGetCharacterAtIndex(negative_pad, 0); + char *str_ptr = (char *)RSTRING_PTR(arg) + 1; if (base == 8) { *str_ptr |= ((~0 << 3) >> ((3 * strlen(str_ptr)) % (sizeof(BDIGIT) * 8))) & ~(~0 << 3); @@ -744,11 +759,13 @@ rb_str_update(arg, 0, num_index, (VALUE)negative_pad); rb_str_update(arg, 0, 0, (VALUE)CFSTR("..")); num_index = 2; +#endif } if (precision_flag) { - pad_format_value(arg, num_index, precision + (IS_NEG(num) && - (sign_pad || negative_pad == NULL) ? 1 : 0), + pad_format_value(arg, num_index, + precision + (IS_NEG(num) + && (sign_pad || negative_pad == NULL) ? 1 : 0), zero_pad); } if (sharp_flag && rb_cmpint(num, Qfalse, Qfalse) != 0) { @@ -773,14 +790,17 @@ } pad_format_value(arg, minus_flag ? -1 : 0, width, CFSTR(" ")); - num = cstr_update(&format_str, start, i - start + 1, - (char *)RSTRING_PTR(arg)); - format_len += num; + num = cstr_update(&format_str, &format_len, start, i - start + 1, + arg); i += num; break; } } - fmt = rb_str_new2(format_str); - return tainted ? OBJ_TAINT(fmt) : fmt; +bail: + fmt = rb_unicode_str_new(format_str, format_len); + if (tainted) { + OBJ_TAINT(fmt); + } + return fmt; } Modified: MacRuby/branches/icu/string.c =================================================================== --- MacRuby/branches/icu/string.c 2010-03-02 06:50:51 UTC (rev 3662) +++ MacRuby/branches/icu/string.c 2010-03-02 07:00:25 UTC (rev 3663) @@ -22,6 +22,8 @@ #include "ruby/node.h" #include "vm.h" +#include <unicode/unum.h> + VALUE rb_cString; VALUE rb_cNSString; VALUE rb_cNSMutableString; @@ -4152,6 +4154,25 @@ } long +rb_uchar_strtol(UniChar *chars, long chars_len, long pos, long *end_offset) +{ + assert(chars != NULL && chars_len > 0 && pos >= 0); + + UErrorCode status = U_ZERO_ERROR; + UNumberFormat *nf = unum_open(UNUM_DEFAULT, NULL, -1, NULL, NULL, &status); + assert(nf != NULL); + + int32_t parse_pos = (int32_t)pos; + int64_t val = unum_parseInt64(nf, chars, chars_len, &parse_pos, &status); + unum_close(nf); + + if (end_offset != NULL) { + *end_offset = (long)parse_pos; + } + return val; +} + +long rb_memhash(const void *ptr, long len) { CFDataRef data = CFDataCreate(NULL, (const UInt8 *)ptr, len);