Introduction
Here is a complete code example that writes UTF-8 to an output file, then reads from that file and carries out a regular expression match followed by a substitution. I tested this with Perl 5.8.1 and Perl 5.8.6 under Mac OS X 10.3.9.
I learned a lot about Perl’s support for Unicode by working on this problem. For more information, read "perldoc perlunicode" and "perldoc perluniintro".
#!/usr/local/bin/perl
# unicodeTest.pl
# 20-Jul-2005
#
# Conrad Halling
# conrad.halling@sphaerula.com
require 5.8.1;
use strict;
use warnings;
use IO::File;
use Carp ();
# Push a UTF-8 output layer onto STDOUT so that the wide characters printed
# later in this script are written as UTF-8. binmode() returns false on
# failure, so check the result instead of silently continuing.
binmode( STDOUT, ':utf8' )
    or Carp::confess( "Can't set the ':utf8' layer on STDOUT: $!" );
# Write Unicode text to the output file.
# BBEdit reports that the file 'test.txt' is UTF-8 without BOM
# (byte order marks).
# $text includes the characters for curly double quotation marks
# (U+201C and U+201D).
my $text = "no \x{201C}Unicode\x{201D}";
my $fileName = "test.txt";
my $outFh = IO::File->new();
# Because the mode contains a ':' IO::File hands it to the three-argument
# form of open(), so the encoding layer is applied as the file is opened.
unless ( $outFh->open( $fileName, '>:encoding(UTF-8)' ) )
{
    Carp::confess( "Can't open file '$fileName' for writing: $!" );
}
unless ( $outFh->print( $text ) )
{
    Carp::confess( "Can't write to file '$fileName': $!" );
}
# Output is buffered, so a write error may only be reported when the
# handle is closed; check close() as well.
unless ( $outFh->close() )
{
    Carp::confess( "Can't close file '$fileName' after writing: $!" );
}
# Read the Unicode text back from the file.
# The ':encoding(UTF-8)' layer decodes the bytes into characters and checks
# that they are well-formed UTF-8; the bare ':utf8' layer does not validate
# its input.
my $inFh = IO::File->new();
unless ( $inFh->open( $fileName, '<:encoding(UTF-8)' ) )
{
    Carp::confess( "Can't open file '$fileName' for reading: $!" );
}
my $dataLine = $inFh->getline();
# getline() returns undef at end of file or on error; stop here rather than
# carrying an undefined value into the code that follows.
unless ( defined $dataLine )
{
    Carp::confess( "No data could be read from file '$fileName'" );
}
$inFh->close();
# Exercise the regular expression engine on the decoded Unicode string:
# show the line as read, then match, substitute, and match again.
print STDOUT "Before: '$dataLine'\n";

# The three steps run left to right and stop at the first one that fails,
# so the substitution only happens when the initial match succeeds and the
# "After" line is only printed when the replacement text is found.
if (    $dataLine =~ m/no/
    and $dataLine =~ s/no/yes/g
    and $dataLine =~ /yes/ )
{
    print STDOUT "After: '$dataLine'\n";
}

exit 0;