テキストの正規化

全角英数字を半角にしたり,半角カナを全角にしたりする。

#! /usr/bin/ruby -w
# -*- coding: utf-8 -*-

# Ruby 1.8用
# $KCODE = 'u'
# require 'jcode'

# Text normalization filter: reads lines from the files named on the command
# line (or from standard input) and prints each line normalized.
#
# Every character whose width or composition matters is written as a \u
# escape (Ruby 1.9+) rather than as a literal, so that an editor or
# transcoder that folds half-width/full-width forms cannot silently turn
# these mappings into no-ops.

# Half-width katakana and punctuation occupy the contiguous range
# U+FF61..U+FF9F (tr range syntax).
HALFWIDTH_KANA = "\uFF61-\uFF9F"

# Full-width counterparts of U+FF61..U+FF9F, in the same order.  The last two
# entries are the combining voiced / semi-voiced sound marks (U+3099, U+309A)
# so that a following pass can fuse them with the preceding kana.
FULLWIDTH_KANA = '。「」、・ヲァィゥェォャュョッー' \
                 'アイウエオカキクケコサシスセソタチツテトナニヌネノ' \
                 'ハヒフヘホマミムメモヤユヨラリルレロワン' \
                 "\u3099\u309A"

# Kana that take the voiced sound mark, and their precomposed forms
# (same order in both strings).
VOICED_BASES = 'かきくけこさしすせそたちつてとはひふへほうゝ' \
               'カキクケコサシスセソタチツテトハヒフヘホウワヰヱヲヽ'
VOICED_FORMS = 'がぎぐげござじずぜぞだぢづでどばびぶべぼゔゞ' \
               'ガギグゲゴザジズゼゾダヂヅデドバビブベボヴヷヸヹヺヾ'

# Kana that take the semi-voiced sound mark, and their precomposed forms.
SEMI_VOICED_BASES = 'はひふへほハヒフヘホ'
SEMI_VOICED_FORMS = 'ぱぴぷぺぽパピプペポ'

# Maps "base kana + combining mark" to the single precomposed character.
COMPOSED_KANA = {}
VOICED_BASES.chars.zip(VOICED_FORMS.chars) do |base, form|
  COMPOSED_KANA["#{base}\u3099"] = form
end
SEMI_VOICED_BASES.chars.zip(SEMI_VOICED_FORMS.chars) do |base, form|
  COMPOSED_KANA["#{base}\u309A"] = form
end
COMPOSED_KANA.freeze

# Matches any decomposed sequence that COMPOSED_KANA knows how to fuse.
DECOMPOSED_KANA = Regexp.union(COMPOSED_KANA.keys)

# Returns a normalized copy of +text+ (the argument is not modified):
#
# * half-width katakana/punctuation -> full-width
# * spacing sound marks U+309B/U+309C -> combining U+3099/U+309A
# * kana followed by a combining sound mark -> precomposed kana
# * U+2022 (bullet) and U+2219 (bullet operator) -> U+30FB (katakana middle dot)
# * U+FF5E (fullwidth tilde) -> U+301C (wave dash); ASCII "~" is left alone
# * full-width digits and Latin letters -> ASCII
# * U+22EF (midline horizontal ellipsis) -> U+2026 (horizontal ellipsis)
# * U+339E (square km) -> "km"   (there are probably many more of this kind)
def normalize_line(text)
  result = text.tr(HALFWIDTH_KANA, FULLWIDTH_KANA)
  result = result.tr("\u309B\u309C", "\u3099\u309A")
  # Must run after the two steps above, which produce the combining marks.
  result = result.gsub(DECOMPOSED_KANA, COMPOSED_KANA)
  result = result.tr("\u2022\u2219\uFF5E\u22EF", "\u30FB\u30FB\u301C\u2026")
  result = result.tr("\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A", '0-9A-Za-z')
  result.gsub("\u339E", 'km')
end

while (line = gets)
  print normalize_line(line)
end

Last modified: