| 1 | |
|---|
| 2 | |
|---|
| 3 | |
|---|
| 4 | |
|---|
| 5 | |
|---|
| 6 | |
|---|
| 7 | |
|---|
| 8 | use strict; |
|---|
| 9 | use warnings; |
|---|
| 10 | |
|---|
| 11 | use DBI; |
|---|
| 12 | use Convert::EastAsianWidth; |
|---|
| 13 | use Encode; |
|---|
| 14 | use Storable qw/dclone/; |
|---|
| 15 | |
|---|
| 16 | |
|---|
| 17 | my @kana = qw( |
|---|
| 18 | xa a xi i xu u xe e xo o ka ga ki gi ku |
|---|
| 19 | gu ke ge ko go sa za shi ji su zu se ze so zo ta |
|---|
| 20 | da chi ji xtu tsu zu te de to do na ni nu ne no ha |
|---|
| 21 | ba pa hi bi pi fu bu pu he be pe ho bo po ma mi |
|---|
| 22 | mu me mo xya ya xyu yu xyo yo ra ri ru re ro xwa wa |
|---|
| 23 | wi we wo n vu ka ke |
|---|
| 24 | ); |
|---|
| 25 | |
|---|
| 26 | my $dbh = DBI->connect('dbi:mysql:pokedex', 'perl', ''); |
|---|
| 27 | my $rows_ref = $dbh->selectall_arrayref('SELECT id, name, name_jp, name_romaji FROM pokemon', { Slice => {} }); |
|---|
| 28 | my $sth = $dbh->prepare('UPDATE pokemon SET name_romaji = ? WHERE id = ?'); |
|---|
| 29 | |
|---|
| 30 | my $mismatches = 0; |
|---|
| 31 | |
|---|
| 32 | for my $row (@$rows_ref) { |
|---|
| 33 | if (not $row->{name_jp}) { |
|---|
| 34 | warn "$row->{name} has no kana!"; |
|---|
| 35 | next; |
|---|
| 36 | } |
|---|
| 37 | |
|---|
| 38 | my $converted_romaji = transliterate($row->{name_jp}); |
|---|
| 39 | |
|---|
| 40 | |
|---|
| 41 | $converted_romaji =~ s/\x{2640}/ F/g; |
|---|
| 42 | $converted_romaji =~ s/\x{2642}/ M/g; |
|---|
| 43 | |
|---|
| 44 | |
|---|
| 45 | $converted_romaji =~ s/ \b (\w) /\u$1/gx; |
|---|
| 46 | |
|---|
| 47 | if ($converted_romaji =~ /[^\x20-\xffx]/) { |
|---|
| 48 | print "!!! NON-ASCII CHARACTERS DETECTED FOR $row->{name}; SKIPPING\n"; |
|---|
| 49 | next; |
|---|
| 50 | } |
|---|
| 51 | |
|---|
| 52 | if ($row->{name_romaji} ne $converted_romaji) { |
|---|
| 53 | $mismatches++; |
|---|
| 54 | $sth->execute($converted_romaji, $row->{id}); |
|---|
| 55 | printf "Converted %12s: %16s -> %16s\n", $row->{name}, $row->{name_romaji}, $converted_romaji; |
|---|
| 56 | } |
|---|
| 57 | } |
|---|
| 58 | |
|---|
| 59 | print "$mismatches changed romaji\n"; |
|---|
| 60 | |
|---|
| 61 | |
|---|
| 62 | |
|---|
| 63 | |
|---|
| 64 | sub transliterate { |
|---|
| 65 | my ($bytes) = @_; |
|---|
| 66 | my $string = decode('utf8', $bytes); |
|---|
| 67 | $string = to_halfwidth($string); |
|---|
| 68 | my @chars = split //, $string; |
|---|
| 69 | |
|---|
| 70 | for my $char (@chars) { |
|---|
| 71 | my $codept = ord $char; |
|---|
| 72 | |
|---|
| 73 | if ($codept < 0x3041 or |
|---|
| 74 | ($codept > 0x3093 and $codept < 0x30a1) or |
|---|
| 75 | $codept > 0x30f6) |
|---|
| 76 | { |
|---|
| 77 | |
|---|
| 78 | $char = [ $char, '' ]; |
|---|
| 79 | next; |
|---|
| 80 | } |
|---|
| 81 | |
|---|
| 82 | my $idx = ($codept - 0x3041) % 96; |
|---|
| 83 | $char = [ $char, $kana[$idx] ]; |
|---|
| 84 | } |
|---|
| 85 | |
|---|
| 86 | for my $i (1 .. $#chars) { |
|---|
| 87 | |
|---|
| 88 | if ($chars[$i][0] eq "\x{30fc}" and |
|---|
| 89 | $chars[$i][0] ne $chars[$i - 1][0] |
|---|
| 90 | ) { |
|---|
| 91 | ( $chars[$i][1] ) = ( $chars[$i - 1][1] =~ /([aeiou])h?$/ ); |
|---|
| 92 | } |
|---|
| 93 | } |
|---|
| 94 | |
|---|
| 95 | my $pos = $#chars; |
|---|
| 96 | while ($pos >= 0) { |
|---|
| 97 | |
|---|
| 98 | if ($chars[$pos][1] =~ /x[aeiou]/) { |
|---|
| 99 | my $vowel = $chars[$pos][1]; |
|---|
| 100 | $vowel =~ s/^x//; |
|---|
| 101 | my $newchar = $chars[$pos - 1][1]; |
|---|
| 102 | |
|---|
| 103 | if ($vowel eq substr $newchar, -1) { |
|---|
| 104 | |
|---|
| 105 | |
|---|
| 106 | $chars[$pos][1] = $vowel; |
|---|
| 107 | } else { |
|---|
| 108 | $newchar =~ s/[aeiou]$/$vowel/; |
|---|
| 109 | splice @chars, $pos - 1, 2, [ 'replaced', $newchar ]; |
|---|
| 110 | } |
|---|
| 111 | |
|---|
| 112 | $pos -= 2; |
|---|
| 113 | next; |
|---|
| 114 | } |
|---|
| 115 | |
|---|
| 116 | |
|---|
| 117 | |
|---|
| 118 | if ($chars[$pos][1] =~ /^xy/ and |
|---|
| 119 | $chars[$pos - 1][1] =~ /i$/) |
|---|
| 120 | { |
|---|
| 121 | my $prefix = substr $chars[$pos - 1][1], 0, -1; |
|---|
| 122 | my $suffix = substr $chars[$pos][1], 1; |
|---|
| 123 | |
|---|
| 124 | |
|---|
| 125 | $suffix = substr $suffix, 1 |
|---|
| 126 | if $prefix eq 'j' or $prefix eq 'sh' or $prefix eq 'ch'; |
|---|
| 127 | |
|---|
| 128 | my $newchar = $prefix . $suffix; |
|---|
| 129 | splice @chars, $pos - 1, 2, [ 'replaced', $newchar ]; |
|---|
| 130 | $pos -= 2; |
|---|
| 131 | next; |
|---|
| 132 | } |
|---|
| 133 | |
|---|
| 134 | |
|---|
| 135 | |
|---|
| 136 | if ($chars[$pos][1] eq 'xtu' and |
|---|
| 137 | $chars[$pos + 1][1] =~ /^[^xaeiou]/ and |
|---|
| 138 | $chars[$pos + 1][1] ne 'n') |
|---|
| 139 | { |
|---|
| 140 | my $newchar = |
|---|
| 141 | substr($chars[$pos + 1][1], 0, 1) . |
|---|
| 142 | $chars[$pos + 1][1]; |
|---|
| 143 | splice @chars, $pos, 2, [ 'replaced', $newchar ]; |
|---|
| 144 | |
|---|
| 145 | |
|---|
| 146 | $pos--; |
|---|
| 147 | next; |
|---|
| 148 | } |
|---|
| 149 | |
|---|
| 150 | $pos--; |
|---|
| 151 | } |
|---|
| 152 | |
|---|
| 153 | my $result = join '', map { $_->[1] || $_->[0] } @chars; |
|---|
| 154 | |
|---|
| 155 | |
|---|
| 156 | $result =~ tr[\x{ff21}-\x{ff3a}\x{ff41}-\x{ff5a}] |
|---|
| 157 | [A-Za-z]; |
|---|
| 158 | |
|---|
| 159 | return $result; |
|---|
| 160 | } |
|---|