#!/usr/bin/perl
#
# Written by Marc Liyanage (http://www.entropy.ch)
#
# Filter that reads all input (files named on the command line, or STDIN),
# replaces every 2-byte UTF-8 sequence with a single byte and prints the
# result to STDOUT. Bytes that are not part of a 2-byte sequence are copied
# through unchanged.
#

use strict;
use warnings;

# Run the filter only when this file is executed as a script. When it is
# loaded from another file (require/do), caller() is true and only the
# subroutines are defined.
main() unless caller;

# Slurp the complete input, convert it and print it.
sub main {
    print utf82iso(join("", <>));
    return;
}

# utf82iso($string)
#
# Returns a copy of $string in which each 2-byte UTF-8 sequence has been
# replaced by the single character computed by decode_2byte_utf8().
# An undefined argument is returned as is.
sub utf82iso {
    my ($string) = @_;
    return $string unless defined $string;

    # Remembers already decoded sequences for the duration of this call so
    # that each distinct sequence is decoded only once.
    my %cache = ();

    # Match 2-byte sequences with the first byte having
    # the high bits set to 110xxxxx (0xc0 - 0xdf) and the second byte having
    # the high bits set to 10xxxxxx (0x80 - 0xbf). Send these sequences to the
    # decode function and replace them with the result of the function.
    $string =~ s/([\xc0-\xdf][\x80-\xbf])/$cache{$1} ||= decode_2byte_utf8($1)/eg;

    return $string;
}

# decode_2byte_utf8($sequence)
#
# Takes a 2-character string holding one 2-byte UTF-8 sequence and returns
# a single character whose value is the low 8 bits of the encoded character
# number. Character numbers above 0xff have no single-byte representation,
# so they are truncated to their low 8 bits (lossy); the result is always
# in the range 0x00 - 0xff.
sub decode_2byte_utf8 {
    # Split the incoming 2-character string into its two characters.
    # ($a and $b are avoided as names because sort() uses them.)
    my ($lead, $trail) = split(//, $_[0]);

    # Mask off the UTF-8 marker bits (keep 5 bits / 0x1f of the first byte,
    # 6 bits / 0x3f of the second), shift the first byte left by 6 positions
    # and OR the two parts together to get the 11-bit character number.
    my $char_number = ((ord($lead) & 0x1f) << 6) | (ord($trail) & 0x3f);

    # Take the rightmost 8 bits of the combined value. The mask must be
    # applied after the OR: "&" binds tighter than "|", so without the
    # separate step only the second byte would be masked.
    return chr($char_number & 0xff);
}

1;