Files
piradio-mini/translate_class.py
T
2017-06-04 19:34:33 +02:00

518 lines
16 KiB
Python
Executable File

#!/usr/bin/env python
# -*- coding: latin-1 -*-
#
# Raspberry Pi Radio Character translation class
# Escaped characters, html and unicode translation to ascii
#
# $Id: translate_class.py,v 1.37 2017/05/16 11:48:02 bob Exp $
#
# Author : Bob Rathbone
# Site : http://www.bobrathbone.com
#
# License: GNU V3, See https://www.gnu.org/copyleft/gpl.html
#
# Disclaimer: Software is provided as is and absolutely no warranties are implied or given.
# The authors shall not be liable for any loss or damage however caused.
#
# Useful Links on character encodings
# http://www.zytrax.com/tech/web/entities.html
# http://www.utf8-chartable.de/
# http://www.codetable.net/
# http://www.ascii-code.com/
import os
import re
import sys
import time
import unicodedata

from log_class import Log
# Module-level Log instance; Translate.__init__() calls log.init('radio') on it
log = Log()
class Translate:
    """Translate escaped, HTML and unicode character sequences to plain text.

    The lookup tables below are keyed on the *textual* escape sequences that
    repr() produces (for example the six characters ``\\xc3\\xa9``), not on
    the raw bytes.  _convert2escape() produces that textual form and the
    other methods apply plain string replacements on it.

    Where a table lists the same key more than once, Python keeps the LAST
    value; conflicting duplicates are flagged with NOTE(review) comments.
    """

    # Backing flag for displayUmlauts(): True makes toLCD() emit the LCD
    # character codes for umlauts, False makes it emit ae/oe/ue/ss.
    # (Kept under a private name so it cannot collide with the setter method.)
    _display_umlauts = True

    # Decimal HTML character reference such as "&#233;" (the ';' is optional)
    _HTML_NUMERIC_REF = re.compile(r'&#([0-9]+);?')

    # Escaped codes (from unicode)
    codes = {
        '//' : '/',     # Double /
        '  ' : ' ',     # Double spaces
        '\\n' : ' ',    # Line feed to space

        # German UTF8 codes
        '\\xef\\xbf\\xbd' : chr(246),

        # Currencies
        '\\xe2\\x82\\xac' : ' Euro ',

        # Special characters
        '\\x80\\x99' : "'",     # Single quote
        '\\xc2\\xa1' : '!',     # Inverted exclamation
        '\\xc2\\xa2' : 'c',     # Cent sign
        '\\xc2\\xa3' : '#',     # Pound sign
        '\\xc2\\xa4' : '$',     # Currency sign
        '\\xc2\\xa5' : 'Y',     # Yen sign
        '\\xc2\\xa6' : '|',     # Broken bar
        '\\xc2\\xa7' : '?',     # Section sign
        '\\xc2\\xa8' : ':',     # Diaeresis
        '\\xc2\\xa9' : '(C)',   # Copyright
        '\\xc2\\xaa' : '?',     # Feminine ordinal
        '\\xc2\\xab' : '<<',    # Double left
        '\\xc2\\xac' : '-',     # Not sign
        '\\xc2\\xad' : '',      # Soft hyphen
        '\\xc2\\xae' : '(R)',   # Registered sign
        '\\xc2\\xaf' : '-',     # Macron
        '\\xc2\\xb0' : 'o',     # Degrees sign
        '\\xc2\\xb1' : '+-',    # Plus minus
        '\\xc2\\xb2' : '2',     # Superscript 2
        '\\xc2\\xb3' : '3',     # Superscript 3
        '\\xc2\\xb4' : '',      # Acute accent (NOTE(review): overridden by 'A' below)
        '\\xc2\\xb5' : 'u',     # Micro sign
        '\\xc2\\xb6' : '',      # Pilcrow
        '\\xc2\\xb7' : '.',     # Middle dot
        '\\xc2\\xb8' : '',      # Cedilla
        '\\xc2\\xb9' : '1',     # Superscript 1
        '\\xc2\\xba' : '',      # Masculine indicator
        '\\xc2\\xbb' : '>>',    # Double right
        '\\xc2\\xbc' : '1/4',   # 1/4 fraction
        '\\xc2\\xbd' : '1/2',   # 1/2 Fraction
        '\\xc2\\xbe' : '3/4',   # 3/4 Fraction
        '\\xc2\\xbf' : '?',     # Inverted ?

        # German unicode escape sequences
        '\\xc3\\x83' : chr(223),    # Sharp s es-zett (NOTE(review): overridden by 'A' below)
        '\\xc3\\x9f' : chr(223),    # Sharp s ?
        '\\xc3\\xa4' : chr(228),    # a umlaut (NOTE(review): overridden by 'a' below)
        '\\xc3\\xb6' : chr(246),    # o umlaut
        '\\xc3\\xbc' : chr(252),    # u umlaut
        '\\xc3\\x84' : chr(196),    # A umlaut
        '\\xc3\\x96' : chr(214),    # O umlaut
        '\\xc3\\x9c' : chr(220),    # U umlaut

        # Scandinavian unicode escape sequences
        '\\xc2\\x88' : 'A',     # aelig
        '\\xc2\\xb4' : 'A',     # aelig (NOTE(review): replaces the '' acute accent entry)
        '\\xc3\\x85' : 'Aa',    # Aring
        '\\xc3\\x93' : 'O',     # O acute
        '\\xc3\\xa4' : 'a',     # a with double dot (NOTE(review): replaces chr(228) above)
        '\\xc3\\xa5' : 'a',     # aring
        '\\xc3\\x86' : 'AE',    # AElig
        '\\xc3\\x98' : '0',     # O crossed
        '\\xc3\\x99' : 'U',     # U grave
        '\\xc3\\xa6' : 'ae',    # aelig
        '\\xc3\\xb0' : 'o',     # eth
        '\\xc3\\xb2' : 'o',     # o grave
        '\\xc3\\xb3' : 'o',     # o acute
        '\\xc3\\xb4' : 'o',     # o circumflex
        '\\xc3\\xb8' : 'o',     # oslash

        # French (Latin) unicode escape sequences
        '\\xc3\\x80' : 'A',     # A grave
        '\\xc3\\x81' : 'A',     # A acute
        '\\xc3\\x82' : 'A',     # A circumflex
        '\\xc3\\x83' : 'A',     # A tilde (NOTE(review): replaces chr(223) above)
        '\\xc3\\x88' : 'E',     # E grave
        '\\xc3\\x89' : 'E',     # E acute
        '\\xc3\\x8a' : 'E',     # E circumflex
        '\\xc3\\xa0' : 'a',     # a grave
        '\\xc3\\xa1' : 'a',     # a acute
        '\\xc3\\xa2' : 'a',     # a circumflex
        '\\xc3\\xa7' : 'c',     # c cedilla
        '\\xc3\\xa8' : 'e',     # e grave
        '\\xc3\\xa9' : 'e',     # e acute
        '\\xc3\\xaa' : 'e',     # e circumflex
        '\\xc3\\xab' : 'e',     # e diaeresis
        '\\xc3\\xae' : 'i',     # i circumflex
        '\\xc3\\xaf' : 'i',     # i diaeresis
        '\\xc3\\xb7' : "/",     # Division sign
        '\\xc5\\x93' : 'oe',    # oe joined

        # Hungarian lower case
        '\\xc3\\xb3' : 'o',     # o acute
        '\\xc3\\xad' : 'i',     # i accent
        '\\xc3\\xb5' : 'o',     # o tilde
        '\\xc5\\x91' : 'o',     # o
        '\\xc5\\xb1' : chr(252),    #
        '\\xc3\\xba' : 'u',     # u acute

        # Polish unicode escape sequences
        '\\xc4\\x84' : 'A',     # A,
        '\\xc4\\x85' : 'a',     # a,
        '\\xc4\\x86' : 'C',     # C'
        '\\xc4\\x87' : 'c',     # c'
        '\\xc4\\x98' : 'E',     # E,
        '\\xc4\\x99' : 'e',     # e,
        '\\xc5\\x81' : 'L',     # L/
        '\\xc5\\x82' : 'l',     # l/
        '\\xc5\\x83' : 'N',     # N'
        '\\xc5\\x84' : 'n',     # n'
        '\\xc3\\xb1' : 'n',     # n'
        '\\xc5\\x9a' : 'S',     # S'
        '\\xc5\\x9b' : 's',     # s'
        '\\xc5\\xb9' : 'Z',     # Z'
        '\\xc5\\xba' : 'z',     # z'
        '\\xc5\\xbb' : 'Z',     # Z.
        '\\xc5\\xbc' : 'z',     # z.

        # Greek upper case
        '\\xce\\x91' : 'A',     # Alpha
        '\\xce\\x92' : 'B',     # Beta
        '\\xce\\x93' : 'G',     # Gamma
        '\\xce\\x94' : 'D',     # Delta
        '\\xce\\x95' : 'E',     # Epsilon
        '\\xce\\x96' : 'Z',     # Zeta
        '\\xce\\x97' : 'H',     # Eta
        '\\xce\\x98' : 'TH',    # Theta
        '\\xce\\x99' : 'I',     # Iota
        '\\xce\\x9a' : 'K',     # Kappa
        '\\xce\\x9b' : 'L',     # Lamda
        '\\xce\\x9c' : 'M',     # Mu
        '\\xce\\x9d' : 'N',     # Nu (UTF-8 for U+039D)
        '\\xce\\x9e' : 'N',     # Labelled Nu originally; NOTE(review): CE 9E is Xi
        '\\xce\\x9f' : 'O',     # Omicron
        '\\xce\\xa0' : 'Pi',    # Pi
        '\\xce ' : 'Pi',        # Pi ?
        '\\xce\\xa1' : 'R',     # Rho
        '\\xce\\xa3' : 'S',     # Sigma
        '\\xce\\xa4' : 'T',     # Tau
        '\\xce\\xa5' : 'Y',     # Upsilon
        '\\xce\\xa6' : 'F',     # Fi
        '\\xce\\xa7' : 'X',     # Chi
        '\\xce\\xa8' : 'PS',    # Psi
        '\\xce\\xa9' : 'O',     # Omega

        # Greek lower case
        '\\xce\\xb1' : 'a',     # Alpha
        '\\xce\\xb2' : 'b',     # Beta
        '\\xce\\xb3' : 'c',     # Gamma
        '\\xce\\xb4' : 'd',     # Delta
        '\\xce\\xb5' : 'e',     # Epsilon
        '\\xce\\xb6' : 'z',     # Zeta
        '\\xce\\xb7' : 'h',     # Eta
        '\\xce\\xb8' : 'th',    # Theta
        '\\xce\\xb9' : 'i',     # Iota
        '\\xce\\xba' : 'k',     # Kappa
        '\\xce\\xbb' : 'l',     # Lamda
        '\\xce\\xbc' : 'm',     # Mu
        '\\xce\\xbd' : 'v',     # Nu
        '\\xce\\xbe' : 'ks',    # Xi
        '\\xce\\xbf' : 'o',     # Omicron
        # The next nine keys are not valid UTF-8 (0xc0-0xc9 cannot follow a
        # lead byte); they are kept only for backward compatibility.
        '\\xce\\xc0' : 'p',     # Pi
        '\\xce\\xc1' : 'r',     # Rho
        '\\xce\\xc3' : 's',     # Sigma
        '\\xce\\xc4' : 't',     # Tau
        '\\xce\\xc5' : 'y',     # Upsilon
        '\\xce\\xc6' : 'f',     # Fi
        '\\xce\\xc7' : 'x',     # Chi
        '\\xce\\xc8' : 'ps',    # Psi
        '\\xce\\xc9' : 'o',     # Omega
        # Actual UTF-8 sequences for U+03C0..U+03C9 (lead byte 0xcf)
        '\\xcf\\x80' : 'p',     # Pi
        '\\xcf\\x81' : 'r',     # Rho
        '\\xcf\\x82' : 's',     # Final sigma
        '\\xcf\\x83' : 's',     # Sigma
        '\\xcf\\x84' : 't',     # Tau
        '\\xcf\\x85' : 'y',     # Upsilon
        '\\xcf\\x86' : 'f',     # Fi
        '\\xcf\\x87' : 'x',     # Chi
        '\\xcf\\x88' : 'ps',    # Psi
        '\\xcf\\x89' : 'o',     # Omega

        # Icelandic
        '\\xc3\\xbe' : 'p',     # Like a p with up stroke
        '\\xc3\\xbd' : 'y',     # y acute

        # Italian characters
        '\\xc3\\xac' : 'i',     # i grave
        '\\xc3\\xb9' : 'u',     # u grave

        # Polish (not previously covered)
        '\\xc3\\xa3' : 'a',     # a tilde

        # Romanian
        '\\xc4\\x83' : 'a',     # a circumflex variant
        '\\xc3\\xa2' : 'a',     # a circumflex
        '\\xc3\\xae' : 'i',     # i circumflex
        '\\xc5\\x9f' : 's',     # s cedilla ?
        '\\xc5\\xa3' : 't',     # t cedilla (U+0163); was wrongly mapped to 's'
        '\\xc8\\x99' : 's',     # s with down stroke
        '\\xc8\\x9b' : 't',     # t with down stroke

        # Spanish not covered above
        '\\xc3\\xb1' : 'n',     # n tilde

        # Turkish not covered above
        '\\xc3\\xbb' : 'u',     # u circumflex
        '\\xc4\\x9f' : 'g',     # g tilde
        '\\xc4\\xb1' : 'i',     # Looks like an i
        '\\xc4\\xb0' : 'I',     # Looks like an I
    }

    # UTF8 codes (Must be checked after above codes checked)
    short_codes = {
        '\\xa0' : ' ',      # Non-breaking space to space
        '\\xb4' : "'",      # Apostrophe
        '\\xc0' : 'A',      # A
        '\\xc1' : 'A',      # A
        '\\xc2' : 'A',      # A
        '\\xc3' : 'A',      # A
        '\\xc4' : 'A',      # A
        '\\xc5' : 'A',      # A
        '\\xc6' : 'Ae',     # AE
        '\\xc7' : 'C',      # C
        '\\xc8' : 'E',      # E
        '\\xc9' : 'E',      # E
        '\\xca' : 'E',      # E
        '\\xcb' : 'E',      # E
        '\\xcc' : 'I',      # I
        '\\xcd' : 'I',      # I
        '\\xce' : 'I',      # I
        '\\xcf' : 'I',      # I
        '\\xd0' : 'D',      # D
        '\\xd1' : 'N',      # N
        '\\xd2' : 'O',      # O
        '\\xd3' : 'O',      # O
        '\\xd4' : 'O',      # O
        '\\xd5' : 'O',      # O
        '\\xd6' : 'O',      # O
        '\\xd7' : 'x',      # Multiply
        '\\xd8' : '0',      # O crossed
        '\\xd9' : 'U',      # U
        '\\xda' : 'U',      # U
        '\\xdb' : 'U',      # U
        '\\xdc' : 'U',      # U umlaut
        '\\xdd' : 'Y',      # Y
        '\\xdf' : 'S',      # Sharp s es-zett
        '\\xe0' : 'a',      # Small a grave (was wrongly mapped to 'e')
        '\\xe1' : 'a',      # Small a acute
        '\\xe2' : 'a',      # Small a circumflex
        '\\xe3' : 'a',      # Small a tilde
        '\\xe4' : 'a',      # Small a diaeresis
        '\\xe5' : 'aa',     # Small a ring above
        '\\xe6' : 'ae',     # Joined ae
        '\\xe7' : 'c',      # Small c Cedilla
        '\\xe8' : 'e',      # Small e grave
        '\\xe9' : 'e',      # Small e acute
        '\\xea' : 'e',      # Small e circumflex
        '\\xeb' : 'e',      # Small e diaeresis
        '\\xed' : 'i',      # Small i acute
        '\\xee' : 'i',      # Small i circumflex
        '\\xf1' : 'n',      # Small n tilde
        '\\xf3' : 'o',      # Small o acute
        '\\xf4' : 'o',      # Small o circumflex
        '\\xf6' : 'o',      # o umlaut
        '\\xf7' : '/',      # Division sign
        '\\xf8' : 'oe',     # Small o strike through
        '\\xf9' : 'u',      # Small u grave
        '\\xfa' : 'u',      # Small u acute
        '\\xfb' : 'u',      # u circumflex
        '\\xc0' : 'A',      # Capital A grave
        '\\xc1' : 'A',      # Capital A acute
        '\\xc7' : 'C',      # Capital C Cedilla
        '\\xc9' : 'E',      # Capital E acute
        '\\xcd' : 'I',      # Capital I acute
        '\\xd3' : 'O',      # Capital O acute
        '\\xda' : 'U',      # Capital U acute
        '\\xfc' : 'u',      # u umlaut
        '\\xbf' : '?',      # Spanish Punctuation
        '\\xb0' : 'o',      # Degrees symbol
    }

    # HTML codes (RSS feeds)
    HtmlCodes = {
        # Currency
        chr(156) : '#',     # Pound by hash
        chr(169) : '(c)',   # Copyright

        # Norwegian
        chr(216) : '0',     # Oslash

        # Spanish french
        chr(241) : 'n',     # Small tilde n
        chr(191) : '?',     # Inverted question mark to ?
        chr(224) : 'a',     # Small a grave to a
        chr(225) : 'a',     # Small a acute to a
        chr(226) : 'a',     # Small a circumflex to a
        chr(232) : 'e',     # Small e grave to e
        chr(233) : 'e',     # Small e acute to e
        chr(234) : 'e',     # Small e circumflex to e
        chr(235) : 'e',     # Small e diaeresis to e
        chr(237) : 'i',     # Small i acute to i
        chr(238) : 'i',     # Small i circumflex to i
        chr(243) : 'o',     # Small o acute to o
        chr(244) : 'o',     # Small o circumflex to o
        chr(250) : 'u',     # Small u acute to u
        chr(251) : 'u',     # Small u circumflex to u
        chr(192) : 'A',     # Capital A grave to A
        chr(193) : 'A',     # Capital A acute to A
        chr(201) : 'E',     # Capital E acute to E
        chr(205) : 'I',     # Capital I acute to I
        chr(209) : 'N',     # Capital N tilde to N
        chr(211) : 'O',     # Capital O acute to O
        chr(218) : 'U',     # Capital U acute to U
        chr(220) : 'U',     # Capital U umlaut to U (NOTE(review): overridden by "Ue" below)
        chr(231) : 'c',     # Small c Cedilla
        chr(199) : 'C',     # Capital C Cedilla

        # German
        chr(196) : "Ae",    # A umlaut
        chr(214) : "Oe",    # O umlaut
        chr(220) : "Ue",    # U umlaut
    }

    unicodes = {
        '\\u201e' : '"',    # ORF feed
        '\\u3000' : " ",
        '\\u201c' : '"',
        '\\u201d' : '"',
        '\\u0153' : "oe",   # French oe
        '\\u2009' : ' ',    # Short space to space
        '\\u2013' : '-',    # Long dash to minus sign
        '\\u2018' : "'",    # Left single quote
        '\\u2019' : "'",    # Right single quote

        # Polish unicodes (I don't know why, but works :) ) (Pecus)
        '\\u0104' : "A",    # A, (Pecus)
        '\\u0105' : "a",    # a, (Pecus)
        '\\u0106' : "C",    # C' (Pecus)
        '\\u0107' : "c",    # c' (Pecus)
        '\\u0118' : "E",    # E, (Pecus)
        '\\u0119' : "e",    # e, (Pecus)
        '\\u0141' : "L",    # L/ (Pecus)
        '\\u0142' : "l",    # l/ (Pecus)
        '\\u0143' : "N",    # N' (Pecus)
        '\\u0144' : "n",    # n' (Pecus)
        #"'xd3" : "O",      # O' (Pecus)
        #"'xf3" : "o",      # o' (Pecus)
        '\\u015a' : "S",    # S' (Pecus)
        '\\u015b' : "s",    # s' (Pecus)
        '\\u0179' : "Z",    # Z' (Pecus)
        '\\u017a' : "z",    # z' (Pecus)
        '\\u017b' : "Z",    # Z. (Pecus)
        '\\u017c' : "z",    # z. (Pecus)
    }

    def __init__(self):
        """Initialise the module-level log object under the name 'radio'."""
        log.init('radio')
        return

    # Translate all (Called by rss class)
    def all(self, text):
        """Apply the escape, unicode and HTML entity translations to *text*.

        Returns the translated string.
        """
        s = self._convert2escape(text)
        s = self._escape(s)
        s = self._unicode(s)
        s = self._html(s)
        return s

    # Convert unicode to escape codes
    def _convert2escape(self, text):
        """Return repr(text) with the surrounding single quotes removed.

        Quotes are only stripped when the repr is longer than two characters,
        so the repr of an empty string is returned unchanged.  All leading and
        trailing single-quote characters are stripped, not just one.
        """
        s = repr(text)
        if len(s) > 2:
            s = s.strip('\'')
        return s

    # Convert escaped characters (umlauts) to normal characters
    def escape(self, text):
        """Translate the escape sequences in *text* (see _escape).

        Any leading or trailing double quotes are removed from the result,
        which covers the case where repr() chose double quotes.
        """
        s = self._convert2escape(text)
        s = self._escape(s)
        return s.strip('"')

    # Convert escaped characters (umlauts etc.) to normal characters
    def _escape(self, text):
        """Replace every key of ``codes`` and then ``short_codes`` in *text*.

        The multi-byte table must run first: the single-byte table would
        otherwise consume the lead byte of a multi-byte sequence.
        """
        s = text
        for table in (self.codes, self.short_codes):
            for code, replacement in table.items():
                s = s.replace(code, replacement)
        s = s.replace("'oC", 'oC')  # Degrees C fudge
        s = s.replace("'oF", 'oF')  # Degrees F fudge
        return s

    # HTML translations (callable)
    def html(self, text):
        """Translate named HTML entities and decimal character references.

        Returns the translated string.
        """
        s = self._html(text)
        s = self._convert_html(s)
        return s

    # HTML translations
    def _html(self, text):
        """Replace a fixed set of named HTML entities in *text*.

        The replacements are applied in the listed order; ``&amp;`` is
        handled before ``&copy;`` and ``&apos;``.
        """
        s = text
        s = s.replace('&lt;', '<')
        s = s.replace('&gt;', '>')
        s = s.replace('&quot;', '"')
        s = s.replace('&nbsp;', ' ')
        s = s.replace('&amp;', '&')
        s = s.replace('&copy;', '(c)')
        s = s.replace('&apos;', "'")  # ' in html (like RSS) (Pecus)
        return s

    # Convert &#nn sequences
    def _convert_html(self, s):
        """Decode decimal HTML character references (``&#65;``) in *s*.

        References to printable ASCII (32..126) become that character; any
        other reference is removed.  The trailing ';' is consumed if present.
        """
        def _decode(match):
            ch = int(match.group(1))
            if 31 < ch < 127:
                return chr(ch)
            return ''

        return self._HTML_NUMERIC_REF.sub(_decode, s)

    # Unicodes etc (callable)
    def unicode(self, text):
        """Translate the ``\\uXXXX`` escape sequences found in repr(text)."""
        s = self._convert2escape(text)
        s = self._unicode(s)
        return s

    # Unicodes etc
    def _unicode(self, text):
        """Replace every key of ``unicodes`` found in *text*."""
        s = text
        for sequence, replacement in self.unicodes.items():
            s = s.replace(sequence, replacement)
        return s

    # Decode greek
    def decode_greek(self, text):
        """Return *text* decoded with the 'macgreek' codec.

        Relies on *text* having a decode() method (a byte string).
        """
        return text.decode('macgreek')

    # Display umlauts as LCD characters (True) or as oe ae etc (False)
    def displayUmlauts(self, value):
        """Set how toLCD() renders a/o/u umlaut and sharp s.

        A true *value* selects the LCD character codes, a false one the
        ASCII digraphs.  The flag lives in ``_display_umlauts`` so that
        assigning it does not replace this method on the instance.
        """
        self._display_umlauts = value
        return

    # Translate special characters (umlauts etc) to LCD values
    # See standard character patterns for LCD display
    def toLCD(self, sp):
        """Return *sp* with accented characters replaced for display.

        First every key of ``HtmlCodes`` is replaced, then the lower-case
        umlauts and sharp s are mapped either to other character codes
        (umlaut display enabled, the default) or to ae/oe/ue/ss.
        """
        s = sp
        for html_code, replacement in self.HtmlCodes.items():
            s = s.replace(html_code, replacement)

        if self._display_umlauts:
            s = s.replace(chr(223), chr(226))   # Sharp s
            s = s.replace(chr(246), chr(239))   # o umlaut (Problem in Hungarian?)
            s = s.replace(chr(228), chr(225))   # a umlaut
            s = s.replace(chr(252), chr(245))   # u umlaut (Problem in Hungarian?)
        else:
            s = s.replace(chr(228), "ae")   # a umlaut
            s = s.replace(chr(223), "ss")   # Sharp s
            s = s.replace(chr(246), "oe")   # o umlaut
            s = s.replace(chr(252), "ue")   # u umlaut
        return s

# End of class
# Test translate class
if __name__ == '__main__':
    translator = Translate()

    # Translate the first command line argument, or a sample string if none
    sample = sys.argv[1] if len(sys.argv) > 1 else 'æ Æ ø Ø å Å'

    # Show the input followed by its escaped (repr) form
    print(sample)
    print(translator._convert2escape(sample))

    # Complete text
    print(translator.all(sample))
    print('')
    sys.exit(0)
# End of file