#!/usr/bin/perl -w
#
# Helpers for classifying and converting the character encoding of strings
# (ASCII, ISO-8859-1, UTF-8 and the codeset of the current locale), plus a
# small driver that reports every non-ASCII entry name found in a directory.
#
# Usage: <script> [directory]      (defaults to the current directory)

use strict;

use Devel::Peek;
use Encode;
use Encode::Guess;
use POSIX qw(LC_CTYPE);

# $locale       - Encode-compatible name of the LC_CTYPE codeset; used by
#                 utf8decode_locale() and utf8encode_locale().
# $utf8_re_bits - regex source: an alternation of the UTF-8 byte sequences
#                 for U+0080..U+00FF; used by looks_like_utf8().
our ($locale, $utf8_re_bits);

# Find out what code page we're in, so we can properly translate
# file/directory encodings.
{
    my $lc = POSIX::setlocale(LC_CTYPE) || 'C';

    # If the locale is C or POSIX, that's ASCII - we'll set to iso-8859-1.
    # Otherwise, normalize the codeset part of the locale.
    if ($lc eq 'C' || $lc eq 'POSIX') {
        $lc = 'iso-8859-1';
    }
    else {
        # A locale such as "en_US" has no codeset part at all; avoid calling
        # lc() on undef and let the emptiness check below pick the default.
        my $codeset = (split /\./, $lc)[1];
        $lc = defined $codeset ? lc $codeset : '';
    }

    # Drop a trailing "@modifier" (e.g. "iso-8859-15@euro").
    $lc =~ s/\@.*\z//s;

    # Locale can end up with nothing, if it's invalid, such as "en_US".
    if ($lc =~ /^\s*$/) {
        $lc = 'iso-8859-1';
    }

    # Sometimes underscores can be aliases - Solaris
    $lc =~ s/_/-/g;

    # ISO encodings with 4 or more digits use a hyphen after "ISO"
    $lc =~ s/^iso(\d{4})/iso-$1/;

    # Special case ISO 2022 and 8859 to be nice
    $lc =~ s/^iso-(2022|8859)([^-])/iso-$1-$2/;

    $lc =~ s/utf-8/utf8/gi;

    # Never hand Encode a name it cannot resolve: decode()/encode() croak on
    # unknown encodings, so fall back to the same default used for C/POSIX.
    $lc = 'iso-8859-1' unless Encode::find_encoding($lc);

    # Publish the result; the *_locale helpers read this global.
    $locale = $lc;

    # Create a regex for looks_like_utf8().  Start at 128: chr(127) is DEL,
    # which is plain ASCII and encodes to a single byte.
    $utf8_re_bits = join '|', map { latin1toUTF8(chr $_) } (128 .. 255);
}

# utf8decode($string, @preferred_encodings)
# Thin alias for utf8decode_guess(); see there for the contract.
sub utf8decode {
    return utf8decode_guess(@_);
}

# utf8decode_guess($string, @preferred_encodings)
#
# Decode a byte string into Perl characters, asking Encode::Guess which
# encoding it is in.
#   - ASCII-only, undefined or already UTF8-flagged input is returned as is.
#   - If the guesser settles on one encoding, that encoding is used.
#   - If the guesser reports several candidates, the first entry of
#     @preferred_encodings that appears in its report is used.
#   - Otherwise (no candidates, or any error while decoding) the input is
#     returned unchanged.
sub utf8decode_guess {
    my ($string, @preferred) = @_;

    # Bail early if it's just ascii
    return $string if !defined $string || looks_like_ascii($string);
    return $string if $] <= 5.007 || Encode::is_utf8($string);

    # Best effort: a failure inside the guesser or decoder leaves $string
    # as it was.
    eval {
        my $icode = Encode::Guess::guess_encoding($string);

        if (ref $icode) {
            # Unambiguous guess: $icode is an encoding object.
            $string = Encode::decode($icode, $string, Encode::FB_QUIET());
        }
        elsif ($icode !~ /^no /i) {
            # Ambiguous guess: $icode is a message naming the candidates.
            # Decode exactly once - decoding an already decoded string with
            # a second encoding would corrupt it.
            for my $encoding (@preferred) {
                next unless $icode =~ /\Q$encoding\E/;
                $string = Encode::decode($encoding, $string, Encode::FB_QUIET());
                last;
            }
        }
    };

    return $string;
}

# utf8decode_locale($string)
# Decode a byte string from the locale's codeset ($locale) into Perl
# characters.  False values and already UTF8-flagged strings are returned
# unchanged.
sub utf8decode_locale {
    my $string = shift;

    if ($string && $] > 5.007 && !Encode::is_utf8($string)) {
        $string = Encode::decode($locale, $string, Encode::FB_QUIET());
    }

    return $string;
}

# utf8encode($string, $encoding)
#
# Return $string as bytes.  $encoding defaults to 'utf8'.
#   - ASCII-only or undefined input is returned as is.
#   - A string without the UTF8 flag is encoded with $encoding.
#   - A string with the UTF8 flag simply has the flag switched off, which
#     exposes its internal UTF-8 bytes regardless of $encoding.
#   - If the result does not satisfy looks_like_utf8(), the original string
#     is returned instead.
sub utf8encode {
    my $string   = shift;
    my $encoding = shift || 'utf8';

    # Bail early if it's just ascii
    return $string if !defined $string || looks_like_ascii($string);
    return $string if $] <= 5.007;

    my $orig = $string;

    # Don't try to encode a string which isn't utf8
    #
    # If the incoming string already is utf8, turn off the utf8 flag.
    if (Encode::is_utf8($string)) {
        Encode::_utf8_off($string);
    }
    else {
        $string = Encode::encode($encoding, $string, Encode::FB_QUIET());
    }

    # Check for doubly encoded strings - and revert back to our original
    # string if that's the case.
    # NOTE(review): looks_like_utf8() only recognises the encodings of
    # U+0080..U+00FF, so results made up solely of other sequences are
    # reverted as well - confirm that this is intended.
    if ($string && !looks_like_utf8($string)) {
        $string = $orig;
    }

    return $string;
}

# utf8encode_locale($string)
# utf8encode() using the locale's codeset ($locale) as the target encoding.
sub utf8encode_locale {
    return utf8encode($_[0], $locale);
}

# utf8off($string)
# Return a copy of $string with the UTF8 flag cleared (false values are
# returned untouched).
sub utf8off {
    my $string = shift;

    if ($string && $] > 5.007) {
        Encode::_utf8_off($string);
    }

    return $string;
}

# utf8on($string)
# Return a copy of $string with the UTF8 flag set, but only when its bytes
# satisfy looks_like_utf8(); otherwise the copy is returned unchanged.
sub utf8on {
    my $string = shift;

    if ($string && $] > 5.007 && looks_like_utf8($string)) {
        Encode::_utf8_on($string);
    }

    return $string;
}

# looks_like_ascii($string)
# Returns 1 when no byte of $string lies outside 0x00-0x7F (undef counts as
# ASCII), 0 otherwise.  Reads $_[0] directly to avoid copying the string.
sub looks_like_ascii {
    use bytes;

    return 1 if !defined $_[0];
    return 1 if $_[0] !~ /([^\x00-\x7F])/;
    return 0;
}

# looks_like_latin1($string)
# Returns 1 when no byte of $string lies outside 0x00-0xFF, 0 otherwise.
# NOTE(review): with "use bytes" in effect every byte is within that range,
# so this appears to return 1 for every input - confirm that is acceptable.
sub looks_like_latin1 {
    use bytes;

    return 1 if !defined $_[0];
    return 1 if $_[0] !~ /([^\x00-\xFF])/;
    return 0;
}

# looks_like_utf8($string)
# Returns 1 when $string contains at least one of the byte sequences listed
# in $utf8_re_bits, 0 otherwise (including for undef).  This is a heuristic,
# not a UTF-8 validator.
sub looks_like_utf8 {
    use bytes;

    return 0 if !defined $_[0];

    # /o: $utf8_re_bits is fixed after start-up, so compile the pattern once.
    return 1 if $_[0] =~ /($utf8_re_bits)/o;
    return 0;
}

# _encode_or_original($encoding, $data)
# Private helper: encode $data with $encoding and return the bytes; return
# $data itself when encoding fails or yields nothing.
sub _encode_or_original {
    my ($encoding, $data) = @_;

    # Encode::encode() with FB_QUIET consumes its source argument in place,
    # so hand it a scratch copy and keep $data intact for the fallback.
    my $scratch = $data;
    my $encoded = eval {
        Encode::encode($encoding, $scratch, Encode::FB_QUIET());
    };

    return (defined $encoded && length $encoded) ? $encoded : $data;
}

# latin1toUTF8($data)
# Return $data encoded as UTF-8 bytes, or $data itself if that fails.
sub latin1toUTF8 {
    my $data = shift;

    if ($] > 5.007) {
        $data = _encode_or_original('utf8', $data);
    }
    else {
        # No Encode available: expand each high byte into its two-byte form.
        $data =~ s/([\x80-\xFF])/chr(0xC0 | (ord($1) >> 6)) . chr(0x80 | (ord($1) & 0x3F))/eg;
    }

    return $data;
}

# utf8toLatin1($data)
# Return $data encoded as ISO-8859-1 bytes, or $data itself if that fails.
sub utf8toLatin1 {
    my $data = shift;

    if ($] > 5.007) {
        $data = _encode_or_original('iso-8859-1', $data);
    }
    else {
        # No Encode available: fold two-byte sequences back into one byte...
        $data =~ s/([\xC0-\xDF])([\x80-\xBF])/chr(((ord($1) << 6) & 0xC0) | (ord($2) & 0x3F))/eg;

        # ...and map the byte sequence E2 80 99 to an ASCII apostrophe.
        $data =~ s/[\xE2][\x80][\x99]/'/g;
    }

    return $data;
}

# encodingFromString($string)
# Classify $string as 'ascii', 'utf8' or 'iso-8859-1', testing in that
# order; 'raw' is returned when none of the tests succeeds.
sub encodingFromString {

    # Don't copy a potentially large string - just read it from the stack.
    return 'ascii'      if looks_like_ascii($_[0]);
    return 'utf8'       if looks_like_utf8($_[0]);
    return 'iso-8859-1' if looks_like_latin1($_[0]);

    return 'raw';
}

# main()
# List the directory given as the first command line argument (default: the
# current directory) and, for every entry whose name is not plain ASCII,
# print the detected encoding and dump the scalar's internals.
# Dies if the directory cannot be opened.
sub main {
    my $dir = (defined $ARGV[0] && length $ARGV[0]) ? $ARGV[0] : '.';

    opendir(my $dh, $dir) or die "Cannot open directory '$dir': $!\n";

    # Explicit defined(): an entry named "0" must not end the loop.
    while (defined(my $entry = readdir $dh)) {
        my $enc = encodingFromString($entry);
        next if $enc eq 'ascii';

        print "encoding: [$enc]\n";

        # Devel::Peek::Dump writes its report to STDERR by itself.
        Dump($entry);
    }

    closedir($dh);

    return;
}

# Run the directory scan only when executed as a script, so the helpers
# above can also be loaded with require/do.
main() unless caller;

1;