#!perl
# Copyright (C) 2001-2007, The Perl Foundation.
# $Id: string_cs.t 23554 2007-12-07 02:21:15Z coke $
use strict;
use warnings;
use lib qw( . lib ../lib ../../lib );
use Test::More;
use Parrot::Test tests => 50;
use Parrot::Config;
=head1 NAME
t/op/string_cs.t - String Charset Tests
=head1 SYNOPSIS
% prove t/op/string_cs.t
=head1 DESCRIPTION
Tests charset support.
=cut
pasm_output_is( <<'CODE', <<OUTPUT, "basic syntax" );
set S0, ascii:"ok 1\n"
print S0
set S0, binary:"ok 2\n"
print S0
set S0, iso-8859-1:"ok 3\n"
print S0
end
CODE
ok 1
ok 2
ok 3
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "charset name" );
set S0, "ok 1\n"
charset I0, S0
charsetname S1, I0
print S1
print "\n"
end
CODE
ascii
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "find_charset" );
find_charset I0, "iso-8859-1"
print "ok 1\n"
find_charset I0, "ascii"
print "ok 2\n"
find_charset I0, "binary"
print "ok 3\n"
end
CODE
ok 1
ok 2
ok 3
OUTPUT
pasm_error_output_like( <<'CODE', <<OUTPUT, "find_charset - not existing" );
find_charset I0, "no_such"
end
CODE
/charset 'no_such' not found/
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "downcase" );
set S0, iso-8859-1:"AEIOU_ÄÖÜ\n"
downcase S1, S0
print S1
end
CODE
aeiou_äöü
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "upcase" );
set S0, iso-8859-1:"aeiou_äöüß\n"
upcase S1, S0
print S1
end
CODE
AEIOU_ÄÖÜß
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "titlecase" );
set S0, iso-8859-1:"zAEIOU_ÄÖÜ\n"
titlecase S1, S0
print S1
end
CODE
Zaeiou_äöü
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "is_whitespace" );
set S0, iso-8859-1:"a\t\n \xa0" # is 0xa0 a whitespace in iso-8859-1??
.include "cclass.pasm"
is_cclass I0, .CCLASS_WHITESPACE, S0, 0
is_cclass I1, .CCLASS_WHITESPACE, S0, 1
is_cclass I2, .CCLASS_WHITESPACE, S0, 2
is_cclass I3, .CCLASS_WHITESPACE, S0, 3
set I4, 4
is_cclass I4, .CCLASS_WHITESPACE, S0, I4
print I0
print I1
print I2
print I3
print I4
print "\n"
set S0, ascii:"a\t\n "
is_cclass I0, .CCLASS_WHITESPACE, S0, 0
is_cclass I1, .CCLASS_WHITESPACE, S0, 1
is_cclass I2, .CCLASS_WHITESPACE, S0, 2
is_cclass I3, .CCLASS_WHITESPACE, S0, 3
is_cclass I4, .CCLASS_WHITESPACE, S0, 4 # access past string boundary: not a whitespace
print I0
print I1
print I2
print I3
print I4
print "\n"
end
CODE
01111
01110
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "is_wordchar" );
.include "cclass.pasm"
set S0, "az019-,._"
length I1, S0
set I2, 0
lp:
is_cclass I0, .CCLASS_WORD, S0, I2
print I0
inc I2
lt I2, I1, lp
print "\n"
end
CODE
111110001
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "is_digit" );
.include "cclass.pasm"
set S0, "az019-,._"
length I1, S0
set I2, 0
lp:
is_cclass I0, .CCLASS_NUMERIC, S0, I2
print I0
inc I2
lt I2, I1, lp
print "\n"
end
CODE
001110000
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "is_punctuation" );
.include "cclass.pasm"
set S0, "az019-,._"
length I1, S0
set I2, 0
lp:
is_cclass I0, .CCLASS_PUNCTUATION, S0, I2
print I0
inc I2
lt I2, I1, lp
print "\n"
end
CODE
000001111
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "is_newline" );
.include "cclass.pasm"
set S0, "a\n"
is_cclass I0, .CCLASS_NEWLINE, S0, 0
print I0
is_cclass I0, .CCLASS_NEWLINE, S0, 1
print I0
print "\n"
end
CODE
01
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "find_wordchar" );
.include "cclass.pasm"
set S0, "_ ab 09"
set I0, 0
length I1, S0
lp:
find_cclass I0, .CCLASS_WORD, S0, I0, I1
print I0
print " "
eq I0, I1, done
inc I0
branch lp
done:
print "ok\n"
end
CODE
0 2 3 5 6 7 ok
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "find_digit" );
.include "cclass.pasm"
set S0, "_ ab 09"
set I0, 0
length I1, S0
lp:
find_cclass I0, .CCLASS_NUMERIC, S0, I0, I1
print I0
print " "
eq I0, I1, done
inc I0
branch lp
done:
print "ok\n"
end
CODE
5 6 7 ok
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "find_punctuation" );
.include "cclass.pasm"
set S0, "_ .b ,9"
set I0, 0
length I1, S0
lp:
find_cclass I0, .CCLASS_PUNCTUATION, S0, I0, I1
print I0
print " "
eq I0, I1, done
inc I0
branch lp
done:
print "ok\n"
end
CODE
0 2 5 7 ok
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i" );
set S0, "abc"
find_charset I0, "iso-8859-1"
trans_charset S1, S0, I0
print S1
print "\n"
charset I0, S1
charsetname S2, I0
print S2
print "\n"
end
CODE
abc
iso-8859-1
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i" );
set S1, "abc"
find_charset I0, "iso-8859-1"
trans_charset S1, I0
print S1
print "\n"
charset I0, S1
charsetname S2, I0
print S2
print "\n"
end
CODE
abc
iso-8859-1
OUTPUT
pasm_error_output_like( <<'CODE', <<OUTPUT, "trans_charset_s_i - lossy" );
set S1, iso-8859-1:"abcä"
find_charset I0, "ascii"
trans_charset S1, I0
print "never\n"
end
CODE
/lossy conversion to ascii/
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i - same" );
set S1, ascii:"abc"
find_charset I0, "ascii"
trans_charset S1, I0
print S1
print "\n"
charset I0, S1
charsetname S2, I0
print S2
print "\n"
end
CODE
abc
ascii
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i iso-8859-1 to binary" );
set S0, iso-8859-1:"abc"
find_charset I0, "binary"
trans_charset S1, S0, I0
print S1
print "\n"
charset I0, S1
charsetname S2, I0
print S2
print "\n"
end
CODE
abc
binary
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i iso-8859-1 to binary" );
set S1, iso-8859-1:"abc"
find_charset I0, "binary"
trans_charset S1, I0
print S1
print "\n"
charset I0, S1
charsetname S2, I0
print S2
print "\n"
end
CODE
abc
binary
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i ascii to binary" );
set S0, ascii:"abc"
find_charset I0, "binary"
trans_charset S1, S0, I0
print S1
print "\n"
charset I0, S1
charsetname S2, I0
print S2
print "\n"
end
CODE
abc
binary
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i ascii to binary" );
set S1, ascii:"abc"
find_charset I0, "binary"
trans_charset S1, I0
print S1
print "\n"
charset I0, S1
charsetname S2, I0
print S2
print "\n"
end
CODE
abc
binary
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i ascii to iso-8859-1" );
set S0, ascii:"abc"
find_charset I0, "iso-8859-1"
trans_charset S1, S0, I0
print S1
print "\n"
charset I0, S1
charsetname S2, I0
print S2
print "\n"
end
CODE
abc
iso-8859-1
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i ascii to iso-8859-1" );
set S1, ascii:"abc"
find_charset I0, "iso-8859-1"
trans_charset S1, I0
print S1
print "\n"
charset I0, S1
charsetname S2, I0
print S2
print "\n"
end
CODE
abc
iso-8859-1
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i iso-8859-1 to unicode" );
set S0, iso-8859-1:"abc_ä_"
find_charset I0, "unicode"
trans_charset S1, S0, I0
print S1
print "\n"
charset I0, S1
charsetname S2, I0
print S2
print "\n"
length I2, S1
print I2
print "\n"
end
CODE
abc_\xc3\xa4_
unicode
6
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i unicode to iso-8859-1" );
set S0, unicode:"abc_\xe4_"
bytelength I2, S0 # XXX its 7 for utf8 only
print I2
print "\n"
find_charset I0, "iso-8859-1"
trans_charset S1, S0, I0
print S1
print "\n"
charset I0, S1
charsetname S2, I0
print S2
print "\n"
length I2, S1
print I2
print "\n"
end
CODE
7
abc_ä_
iso-8859-1
6
OUTPUT
pir_output_is( <<'CODE', <<'OUTPUT', "bug #34661 literal" );
.sub main :main
$S0 = unicode:"\"]\nif I3 == "
print "ok 1\n"
.end
CODE
ok 1
OUTPUT
pir_output_is( <<'CODE', <<'OUTPUT', "todo #34660 hash" );
.sub main :main
$P0 = new 'Integer'
$P0 = 42
store_global "Foo", unicode:"Bar", $P0
print "ok 1\n"
$P1 = find_global "Foo", "Bar"
print "ok 2\n"
print $P1
print "\n"
.end
CODE
ok 1
ok 2
42
OUTPUT
pir_output_is( <<'CODE', <<'OUTPUT', "concat ascii, utf8" );
.sub main
.local string s, t, u
s = "abcd"
t = unicode:"efg\n"
u = s . t
print u
s = unicode:"abcd"
t = "efg\n"
u = s . t
print u
.end
CODE
abcdefg
abcdefg
OUTPUT
SKIP: {
skip( 'no ICU lib', 16 ) unless $PConfig{has_icu};
pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase" );
set S0, iso-8859-1:"TÖTSCH"
find_charset I0, "unicode"
trans_charset S1, S0, I0
downcase S1
getstdout P0 # need to convert back to utf8
push P0, "utf8" # push utf8 output layer
print S1
print "\n"
end
CODE
t\xc3\xb6tsch
OUTPUT
pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase, trans_charset_s_i" );
set S0, iso-8859-1:"TÖTSCH"
find_charset I0, "unicode"
trans_charset S1, S0, I0
downcase S1
find_charset I0, "iso-8859-1"
trans_charset S1, I0
print S1
print "\n"
end
CODE
t\xf6tsch
OUTPUT
pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase - transcharset" );
set S0, iso-8859-1:"TÖTSCH"
find_charset I0, "unicode"
trans_charset S1, S0, I0
downcase S1
find_encoding I0, "utf8"
trans_encoding S2, S1, I0
print S2
print "\n"
end
CODE
t\xc3\xb6tsch
OUTPUT
pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 ord, length" );
set S0, iso-8859-1:"TÖTSCH"
find_charset I0, "unicode"
trans_charset S1, S0, I0
find_encoding I0, "utf16"
trans_encoding S1, S1, I0
length I1, S1
print I1
print "\n"
null I0
loop:
ord I2, S1, I0
print I2
print '_'
inc I0
lt I0, I1, loop
print "\n"
end
CODE
6
84_214_84_83_67_72_
OUTPUT
pasm_output_is( <<'CODE', <<"OUTPUT", "chopn utf8" );
set S0, iso-8859-1:"TTÖÖ"
find_charset I0, "unicode"
trans_charset S1, S0, I0
chopn S1, 2
print S1
print ' '
length I0, S1
print I0
print ' '
.include "stringinfo.pasm"
stringinfo I0, S1, .STRINGINFO_BUFUSED
print I0
print "\n"
end
CODE
TT 2 2
OUTPUT
pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 append" );
set S0, iso-8859-1:"Tötsch"
find_charset I0, "unicode"
trans_charset S1, S0, I0
find_encoding I0, "utf16"
trans_encoding S1, S1, I0
concat S1, " Leo"
length I0, S1
print I0
print ' '
.include "stringinfo.pasm"
stringinfo I0, S1, .STRINGINFO_BUFUSED
print I0
print "\n"
find_encoding I0, "utf8"
trans_encoding S2, S1, I0
print S2
print "\n"
end
CODE
10 20
T\xc3\xb6tsch Leo
OUTPUT
pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 concat" );
set S0, iso-8859-1:"Tötsch"
find_charset I0, "unicode"
trans_charset S1, S0, I0
find_encoding I0, "utf16"
trans_encoding S1, S1, I0
concat S2, S1, " Leo"
length I0, S2
print I0
print ' '
.include "stringinfo.pasm"
stringinfo I0, S2, .STRINGINFO_BUFUSED
print I0
print "\n"
find_encoding I0, "utf8"
trans_encoding S2, S2, I0
print S2
print "\n"
end
CODE
10 20
T\xc3\xb6tsch Leo
OUTPUT
pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 substr" );
set S0, iso-8859-1:"Tötsch"
find_charset I0, "unicode"
trans_charset S1, S0, I0
find_encoding I0, "utf16"
trans_encoding S1, S1, I0
substr S2, S1, 1, 2
find_encoding I0, "utf8"
trans_encoding S2, S2, I0
print S2
print "\n"
end
CODE
\xc3\xb6t
OUTPUT
pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 replace" );
set S0, iso-8859-1:"Tötsch"
find_charset I0, "unicode"
trans_charset S1, S0, I0
find_encoding I0, "utf16"
trans_encoding S1, S1, I0
substr S2, S1, 1, 1, "oe"
find_encoding I0, "utf8"
trans_encoding S2, S2, I0
trans_encoding S1, S1, I0
print S2
print "\n"
print S1
print "\n"
end
CODE
\xc3\xb6
Toetsch
OUTPUT
pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search" );
set S0, iso-8859-1:"TÖTSCH"
find_charset I0, "unicode"
trans_charset S1, S0, I0
downcase S1
set S2, iso-8859-1:"öt"
index I0, S1, S2
print I0
print "\n"
end
CODE
1
OUTPUT
pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search" );
set S0, iso-8859-1:"TÖTSCH"
find_charset I0, "unicode"
trans_charset S1, S0, I0
downcase S1
set S2, iso-8859-1:"öt"
index I0, S1, S2
print I0
print "\n"
concat S1, S2
index I0, S1, S2, 2
print I0
print "\n"
end
CODE
1
6
OUTPUT
pasm_output_is( <<'CODE', <<"OUTPUT", "unicode upcase" );
set S0, iso-8859-1:"tötsch"
find_charset I0, "unicode"
trans_charset S1, S0, I0
upcase S1
getstdout P0 # need to convert back to utf8
push P0, "utf8" # push utf8 output layer
print S1
print "\n"
end
CODE
T\x{c3}\x{96}TSCH
OUTPUT
pasm_output_is( <<'CODE', <<"OUTPUT", "unicode upcase to combined char" );
set S1, unicode:"hacek j \u01f0"
upcase S1
getstdout P0 # need to convert back to utf8
push P0, "utf8" # push utf8 output layer
print S1
print "\n"
end
CODE
HACEK J J\xcc\x8c
OUTPUT
# charset/unicode.c
#
# 106 dest_len = u_strToUpper(src->strstart, dest_len,
# (gdb) p src_len
# $17 = 7
# (gdb) p dest_len
# $18 = 7
# (gdb) x /8h src->strstart
# 0x844fb60: 0x005f 0x005f 0x005f 0x01f0 0x0031 0x0032 0x0033 0x0000
# (gdb) n
# 110 src->bufused = dest_len * sizeof(UChar);
# (gdb) p dest_len
# $19 = 8
# (gdb) x /8h src->strstart
# 0x844fb60: 0x005f 0x005f 0x005f 0x004a 0x030c 0x0031 0x0032 0x0000
pasm_output_is( <<'CODE', <<"OUTPUT", "unicode upcase to combined char 3.2 bug?" );
set S1, unicode:"___\u01f0123"
upcase S1
getstdout P0 # need to convert back to utf8
push P0, "utf8" # push utf8 output layer
print S1
print "\n"
end
CODE
___J\xcc\x8c123
OUTPUT
pasm_output_is( <<'CODE', <<"OUTPUT", "unicode titlecase" );
set S0, iso-8859-1:"tötsch leo"
find_charset I0, "unicode"
trans_charset S1, S0, I0
titlecase S1
getstdout P0 # need to convert back to utf8
push P0, "utf8" # push utf8 output layer
print S1
print "\n"
end
CODE
T\x{c3}\x{b6}tsch Leo
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "combose combined char" );
set S1, unicode:"___\u01f0___"
length I0, S1
upcase S1 # decompose J+hacek
length I1, S1 # 1 longer
downcase S1 # j+hacek
length I2, S1
compose S1, S1
length I3, S1 # back at original string
getstdout P0 # need to convert back to utf8
push P0, "utf8" # push utf8 output layer
print S1
print "\n"
print I0
print ' '
print I1
print ' '
print I2
print ' '
print I3
print "\n"
end
CODE
___\x{c7}\x{b0}___
7 8 8 7
OUTPUT
} # SKIP
pasm_output_is( <<'CODE', <<'OUTPUT', "escape ascii" );
set S0, "abcdefghi\n"
escape S1, S0
print S1
print "\n"
end
CODE
abcdefghi\n
OUTPUT
pasm_output_is( <<'CODE', <<'OUTPUT', "escape ctrl" );
set S0, "\x00\x01\x1f\x7f"
escape S1, S0
print S1
print "\n"
end
CODE
\x{0}\x{1}\x{1f}\x{7f}
OUTPUT
pasm_output_is( <<'CODE', <<'OUTPUT', "escape latin1" );
set S0, iso-8859-1:"tötsch leo"
escape S1, S0
print S1
print "\n"
end
CODE
t\x{f6}tsch leo
OUTPUT
pasm_output_is( <<'CODE', <<'OUTPUT', "escape unicode" );
set S0, unicode:"\u2001\u2002\u2003\u2004\x{e01ef}\u0114"
escape S1, S0
print S1
print "\n"
end
CODE
\u2001\u2002\u2003\u2004\x{e01ef}\u0114
OUTPUT
# Local Variables:
# mode: cperl
# cperl-indent-level: 4
# fill-column: 100
# End:
# vim: expandtab shiftwidth=4:
syntax highlighted by Code2HTML, v. 0.9.1