namespace {
// Extracts the single UTF-8 encoded character that starts at byte offset
// `pos` of `from` and stores it in `to`.
//
//   from  UTF-8 encoded text
//   pos   byte offset of the character's first byte
//   to    receives the bytes of that character; cleared when `pos` is at or
//         past the end of `from`
//
// The lead byte announces how many bytes the character occupies (1 to 4).
// Only continuation bytes (10xxxxxx) that are really present are consumed,
// so a truncated or malformed sequence yields just the bytes that belong to
// it instead of swallowing the characters that follow.  A stray continuation
// byte or an invalid lead byte (0xF8..0xFF) is returned as a single byte.
// For any non-empty remainder the result is at least one byte long, so a
// caller may advance by `to.size()`.
void utfchar(const string& from, string::size_type pos, string& to) {
    if (pos >= from.size()) {
        to.clear();
        return;
    }

    // Work on the unsigned value: plain char may be signed.
    const unsigned char lead = static_cast<unsigned char>(from[pos]);

    // Sequence length announced by the lead byte.
    string::size_type announced = 1;
    if ((lead & 0xE0) == 0xC0)
        announced = 2;
    else if ((lead & 0xF0) == 0xE0)
        announced = 3;
    else if ((lead & 0xF8) == 0xF0)
        announced = 4;

    // Accept continuation bytes only while they are present and well formed.
    string::size_type len = 1;
    while (len < announced
           && pos + len < from.size()
           && (static_cast<unsigned char>(from[pos + len]) & 0xC0) == 0x80)
        ++len;

    to = from.substr(pos, len);
}
romaji["プ"] = "pu";
romaji["ペ"] = "pe";
romaji["ポ"] = "po";
- romaji["ã\83¼"] = "";
+ romaji["ã\83»"] = " ";
// -- double width letters ------
romaji["A"] = "A";
romaji["$"] = "$";
romaji["%"] = "%";
romaji["&"] = "&";
- romaji["'"] = "'"; // TODO:
- romaji["("] = "(";
- romaji[")"] = ")";
+ romaji["'"] = "'";
romaji["*"] = "*";
romaji["+"] = "+";
romaji[","] = ",";
romaji["?"] = "?";
romaji["@"] = "@";
+ romaji["〔"] = "(";
+ romaji["〕"] = ")";
+ romaji["("] = "(";
+ romaji[")"] = ")";
romaji["["] = "[";
- romaji["\"] = "\\";
romaji["]"] = "]";
+ romaji["【"] = "[";
+ romaji["】"] = "]";
+ romaji["{"] = "{";
+ romaji["}"] = "}";
+ romaji["\"] = "\\";
romaji["^"] = "^";
romaji["_"] = "_";
romaji["`"] = "`";
-
- romaji["{"] = "{";
romaji["|"] = "|";
- romaji["}"] = "}";
romaji["~"] = "~";
-
-
- // don't know where those belong to
+ romaji["ー"] = "";
+ romaji["。"] = ".";
+ romaji["、"] = ",";
romaji["〜"] = "~";
- romaji["、"] = ","; // TODO:
romaji["−"] = "-";
-
- romaji[" "] = " ";
romaji["―"] = "-";
- romaji["・"] = "-"; // FIXME
+ romaji[" "] = " ";
+
+ romaji["ー"] = "\3";
+}
+
// Removes a '\1' marker together with the character in front of it.
//
//   pos  index of the '\1' marker inside `rom`
//   rom  romaji string that is edited in place
//
// For something like "ki" + '\1' the character preceding the marker and the
// marker itself are erased.  If the marker is directly followed by a 'y',
// that 'y' is erased as well.
//
// When the marker is the very first character there is nothing in front of
// it; in that case only the marker (and a directly following 'y') is erased
// instead of computing `pos - 1`, which would wrap around and make
// string::erase throw std::out_of_range.
void remove_quote_1(
    string::size_type const pos,
    string &rom)
{
    bool const drop_y = pos + 1 < rom.size() && rom[pos + 1] == 'y';

    if (pos == 0) {
        rom.erase(0, drop_y ? 2 : 1);
        return;
    }

    rom.erase(pos - 1, drop_y ? 3 : 2);
}
void kana2romaji(const string& kana, string& rom) {
rom.clear();
for (string::size_type pos = 0; pos < kana.size(); ) {
- string ch;
+ string ch;
utfchar(kana, pos, ch);
romaji_map::const_iterator trans = romaji.find(ch);
if (trans == romaji.end()) {
rom += ch;
if (ch.size() > 1)
- cout << "Don't know how to translate '" << ch << "' in '" << kana << "' to romaji." << endl;
+ cout << "Don't know how to translate '" << ch << "' in '" << kana << "' to romaji.\n";
}
else
rom += trans->second;
}
for (string::size_type pos = 0; pos < rom.size(); ++pos)
if (rom[pos] == '\1') {
- string::size_type from = pos, count = 1;
- if (pos > 1 && (rom[pos - 2] == 'h' || rom[pos - 2] == 'j')) {
- --from;
- count = (pos + 1 < rom.size() && rom[pos + 1] == 'y') ? 3 : 2;
+
+ if (pos > 2) {
+ string const pred = rom.substr(pos - 3, 3);
+ if(pred == "chi" ||
+ pred == "shi" ||
+ pred == "dzi" ||
+ pred == "tsu" ||
+ pred == "shi"
+ ) {
+ remove_quote_1(pos, rom);
+ pos -= 2;
+ continue;
+ }
+ }
+ if (pos > 1) {
+
+ string const pred = rom.substr(pos - 2, 2);
+ if(pred == "ki" ||
+ pred == "ni" ||
+ pred == "mi" ||
+ pred == "ri" ||
+ pred == "gi" ||
+ pred == "ji" ||
+ pred == "hi" ||
+ pred == "bi" ||
+ pred == "pi"
+ )
+ {
+ // shorten "ji\1y" to "j"
+ // otherwise remove "\1" and the preceding character
+ // but not the y
+ rom.erase(
+ pos - 1,
+ (pos + 1 < rom.size()
+ && rom[pos + 1] == 'y'
+ && pred[0] == 'j')
+ ? 3
+ : 2);
+ pos -= 2;
+ continue;
+ }
+ else if(
+ pred == "fu" ||
+ pred == "de" ||
+ pred == "te" ||
+ pred == "vu")
+ {
+ remove_quote_1(pos, rom);
+ pos -= 2;
+ continue;
+ }
+ else if(
+ pred == "su" ||
+ pred == "zu" ||
+ pred == "te" ||
+ pred == "de" ||
+ pred == "ku" ||
+ pred == "gu" ||
+ pred == "mu"
+ )
+ {
+ rom[pos - 1] = 'w';
+ rom.erase(pos, 1);
+ --pos;
+ continue;
+ }
+ else if(
+ pred == "to" ||
+ pred == "do"
+ )
+ {
+ rom[pos - 1] = 'h';
+ rom.erase(pos, 1);
+ --pos;
+ continue;
+ }
+ }
+
+ if (pos > 0) {
+ char const pred = rom[pos - 1];
+
+ switch(pred)
+ {
+ case 'a':
+ case 'i':
+ case 'u':
+ case 'e':
+ case 'o':
+ rom.erase(pos, 1);
+ --pos;
+ break;
+ default:
+ cout << "Encountered a special character in " << kana << " but don't know what to do with it.\n";
+ }
+ }
+ else
+ {
+ rom.erase(pos, 1);
+ --pos;
}
- rom.erase(from, count);
}
- else if (rom[pos] == '\2' && pos + 1 < rom.size())
+ else if (rom[pos] == '\2')
+ {
+ // two tsu may follow each other, so just remove them
+ if(pos + 1 < rom.size() && rom[pos + 1] != '\2')
rom[pos] = rom[pos + 1];
+ else
+ {
+ rom.erase(pos, 1);
+ --pos;
+ }
+ }
+ else if (rom[pos] == '\3')
+ {
+ if(pos == 0)
+ {
+ if(rom.size() == 1)
+ rom = "chouon";
+ else
+ {
+ cout << "ー is the first letter of " << kana << ". Don't know how to translate this.\n";
+ rom.erase(pos, 1);
+ --pos;
+ }
+ }
+ else
+ rom[pos] = rom[pos-1];
+ }
+
+ for (string::size_type pos = 0; pos < rom.size(); ++pos)
+ switch(rom[pos])
+ {
+ case '\1':
+ case '\2':
+ case '\3':
+ cout << "Failed to translate " << kana << '\n';
+ return;
+ }
}