#!perl
# Copyright (C) 2001-2007, The Perl Foundation.
# $Id: stringu.t 18533 2007-05-14 01:12:54Z chromatic $
use strict;
use warnings;
use lib qw( . lib ../lib ../../lib );
use Test::More;
use Parrot::Test tests => 25;
use Parrot::Config;
=head1 NAME
t/op/stringu.t - Unicode String Test
=head1 SYNOPSIS
% prove t/op/stringu.t
=head1 DESCRIPTION
Tests Parrot unicode string system.
=cut
pasm_output_is( <<'CODE', <<OUTPUT, "angstrom" );
getstdout P0
push P0, "utf8"
chr S0, 0x212B
print P0, S0
print P0, "\n"
end
CODE
\xe2\x84\xab
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "escaped angstrom" );
getstdout P0
push P0, "utf8"
set S0, unicode:"\x{212b}"
print S0
print "\n"
end
CODE
\xe2\x84\xab
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "escaped angstrom 2" );
getstdout P0
push P0, "utf8"
set S0, unicode:"aaaaaa\x{212b}"
print S0
print "\n"
end
CODE
aaaaaa\xe2\x84\xab
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "escaped angstrom 3" );
getstdout P0
push P0, "utf8"
set S0, unicode:"aaaaaa\x{212b}-aaaaaa"
print S0
print "\n"
end
CODE
aaaaaa\xe2\x84\xab-aaaaaa
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, 'escaped angstrom 3 \uhhhh' );
getstdout P0
push P0, "utf8"
set S0, unicode:"aaaaaa\u212b-aaaaaa"
print S0
print "\n"
end
CODE
aaaaaa\xe2\x84\xab-aaaaaa
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "MATHEMATICAL BOLD CAPITAL A" );
getstdout P0
push P0, "utf8"
set S0, unicode:"aaaaaa\x{1d400}-aaaaaa"
print S0
print "\n"
end
CODE
aaaaaa\xf0\x9d\x90\x80-aaaaaa
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, 'MATHEMATICAL BOLD CAPITAL A \U' );
getstdout P0
push P0, "utf8"
set S0, unicode:"aaaaaa\U0001d400-aaaaaa"
print S0
print "\n"
end
CODE
aaaaaa\xf0\x9d\x90\x80-aaaaaa
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "two upscales" );
getstdout P0
push P0, "utf8"
set S0, unicode:"aaaaaa\x{212b}-bbbbbb\x{1d400}-cccccc"
print S0
print "\n"
length I0, S0
print I0
print "\n"
end
CODE
aaaaaa\xe2\x84\xab-bbbbbb\xf0\x9d\x90\x80-cccccc
22
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "two upscales - don't downscale" );
getstdout P0
push P0, "utf8"
set S0, unicode:"aaaaaa\x{1d400}-bbbbbb\x{212b}-cccccc"
print S0
print "\n"
length I0, S0
print I0
print "\n"
end
CODE
aaaaaa\xf0\x9d\x90\x80-bbbbbb\xe2\x84\xab-cccccc
22
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, '\cX, \ooo' );
getstdout P0
push P0, "utf8"
set S0, "ok 1\cJ"
print S0
set S0, "ok 2\012"
print S0
set S0, "ok 3\12"
print S0
set S0, "ok 4\x0a"
print S0
set S0, "ok 5\xa"
print S0
end
CODE
ok 1
ok 2
ok 3
ok 4
ok 5
OUTPUT
pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \u' );
set S0, "x\uy"
print "never\n"
end
CODE
/Illegal escape sequence in/
OUTPUT
pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \u123' );
set S0, "x\u123y"
print "never\n"
end
CODE
/Illegal escape sequence in/
OUTPUT
pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \U123' );
set S0, "x\U123y"
print "never\n"
end
CODE
/Illegal escape sequence in/
OUTPUT
pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \x' );
set S0, "x\xy"
print "never\n"
end
CODE
/Illegal escape sequence in/
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );
set S0, utf8:unicode:"«"
length I0, S0
print I0
print "\n"
print S0
print "\n"
end
CODE
1
\xc2\xab
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );
set S0, utf8:unicode:"\xc2\xab"
length I0, S0
print I0
print "\n"
print S0
print "\n"
end
CODE
1
\xc2\xab
OUTPUT
pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 literals - illegal" );
set S0, utf8:unicode:"\xf2\xab"
length I0, S0
print I0
print "\n"
print S0
print "\n"
end
CODE
/Malformed UTF-8 string/
OUTPUT
pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 as malformed ascii" );
set S0, ascii:"«"
length I0, S0
print I0
print "\n"
end
CODE
/Malformed string/
OUTPUT
pasm_output_is( <<'CODE', <<OUTPUT, "substr with a UTF8 replacement #36794" );
set S0, "AAAAAAAAAA\\u666"
set I0, 0x666
chr S1, I0
substr S0, 10, 5, S1
print S0
print "\n"
end
CODE
AAAAAAAAAA\xd9\xa6
OUTPUT
SKIP: {
skip( 'no ICU lib', 3 ) unless $PConfig{has_icu};
pir_output_is( <<'CODE', <<OUTPUT, "downcase changes string behind scenes" );
.sub main
.local string str
.local string rest
str = unicode:".xyz"
rest = substr str, 1
print rest
print "\n"
str = unicode:".xyz"
$S99 = downcase str
rest = substr str, 1
print rest
print "\n"
.end
CODE
xyz
xyz
OUTPUT
pir_output_is( <<'CODE', <<OUTPUT, "downcase asciish" );
.sub main
.local string str
.local string rest
str = unicode:".XYZ"
$S0 = downcase str
print $S0
print "\n"
.end
CODE
.xyz
OUTPUT
# escape does not produce utf8, just a raw sequence of chars
pir_output_is( <<"CODE", <<'OUTPUT', "escape utf16" );
.sub main
.local string s, t
.local int i
s = iso-8859-1:"T\xf6tsch"
i = find_charset "unicode"
s = trans_charset s, i
t = upcase s
escape t, t
print t
print "\\n"
.end
CODE
T\x{d6}TSCH
OUTPUT
}
# Tests for .CCLASS_WHITESPACE
pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_WHITESPACE in unicode" );
.sub main
.include 'cclass.pasm'
.local string s
s = unicode:" \t\u207babc\n\u2000\u2009"
$I9 = length s
$I0 = is_cclass .CCLASS_WHITESPACE, s, 0
print $I0
$I0 = is_cclass .CCLASS_WHITESPACE, s, 1
print $I0
$I0 = is_cclass .CCLASS_WHITESPACE, s, 2
print $I0
$I0 = find_not_cclass .CCLASS_WHITESPACE, s, 0, $I9
print $I0
$I0 = find_not_cclass .CCLASS_WHITESPACE, s, $I0, $I9
print $I0
$I0 = find_cclass .CCLASS_WHITESPACE, s, $I0, $I9
print $I0
$I0 = find_not_cclass .CCLASS_WHITESPACE, s, $I0, $I9
print $I0
print "\n"
.end
CODE
1102269
OUTPUT
SKIP: {
skip "Tests seem to fail on big endian machines with icu", 2 if $PConfig{byteorder} eq '4321';
# Tests for .CCLASS_NUMERIC
pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_NUMERIC in unicode" );
.sub main
.include 'cclass.pasm'
.local string s
s = unicode:"01\u207bxyz\u0660\u17e1\u19d9"
$I9 = length s
$I0 = is_cclass .CCLASS_NUMERIC, s, 0
print $I0
$I0 = is_cclass .CCLASS_NUMERIC, s, 1
print $I0
$I0 = is_cclass .CCLASS_NUMERIC, s, 2
print $I0
$I0 = find_not_cclass .CCLASS_NUMERIC, s, 0, $I9
print $I0
$I0 = find_not_cclass .CCLASS_NUMERIC, s, $I0, $I9
print $I0
$I0 = find_cclass .CCLASS_NUMERIC, s, $I0, $I9
print $I0
$I0 = find_not_cclass .CCLASS_NUMERIC, s, $I0, $I9
print $I0
print "\n"
.end
CODE
1102269
OUTPUT
# Concatenate unicode: with iso-8859-1; RT #39930 if no icu
pir_output_is(
<<'CODE', <<"OUTPUT", "Concat unicode with iso-8859-1", $PConfig{has_icu} ? () : ( todo => 'RT #39930' ) );
.sub main
$S0 = unicode:"A"
$S1 = ascii:"B"
$S2 = concat $S0, $S1
print $S2
print "\n"
$S0 = unicode:"A"
$S1 = unicode:"B"
$S2 = concat $S0, $S1
print $S2
print "\n"
$S0 = unicode:"A"
$S1 = iso-8859-1:"B"
$S2 = concat $S0, $S1
print $S2
print "\n"
.end
CODE
AB
AB
A\x00B\x00
OUTPUT
}
# Local Variables:
# mode: cperl
# cperl-indent-level: 4
# fill-column: 100
# End:
# vim: expandtab shiftwidth=4:
syntax highlighted by Code2HTML, v. 0.9.1