New translate class, but not final...

This commit is contained in:
Pecusx
2017-06-04 19:34:33 +02:00
committed by GitHub
parent c64206c8e0
commit 717575d052
2 changed files with 194 additions and 94 deletions
+7 -3
View File
@@ -1,7 +1,7 @@
#!/usr/bin/python #!/usr/bin/python
# -*- coding: latin-1 -*- # -*- coding: latin-1 -*-
# #
# $Id: rss_class.py,v 1.23 2016/06/28 06:40:51 bob Exp $ # $Id: rss_class.py,v 1.25 2017/04/26 08:18:25 bob Exp $
# Raspberry Pi RSS feed class # Raspberry Pi RSS feed class
# #
# Author : Bob Rathbone # Author : Bob Rathbone
@@ -63,9 +63,13 @@ class Rss:
self.feed_available = True self.feed_available = True
line = self.rss.pop() line = self.rss.pop()
self.length -= 1 self.length -= 1
line = line.lstrip('<')
feed = translate.all(line) feed = translate.all(line)
feed = feed.lstrip('u"')
feed = feed.lstrip("u'")
feed = feed.lstrip('"') feed = feed.lstrip('"')
feed = feed.lstrip('<') feed = feed.rstrip('"')
if not self.rss_error: if not self.rss_error:
log.message(feed,log.DEBUG) log.message(feed,log.DEBUG)
return feed return feed
@@ -94,7 +98,7 @@ class Rss:
self.rss_error = True # Set RSS error self.rss_error = True # Set RSS error
rss.append("No RSS feed found") rss.append("No RSS feed found")
return rss return rss
def parse_feed(self,dom): def parse_feed(self,dom):
rss = [] rss = []
for news in dom.getElementsByTagName('*'): for news in dom.getElementsByTagName('*'):
+187 -91
View File
@@ -1,10 +1,10 @@
#!/usr/bin/python #!/usr/bin/env python
# -*- coding: latin-1 -*- # -*- coding: latin-1 -*-
# #
# Raspberry Pi Radio Character translation class # Raspberry Pi Radio Character translation class
# Escaped characters, html and unicode translation to ascii # Escaped characters, html and unicode translation to ascii
# #
# $Id: translate_class.py,v 1.24 2016/04/14 06:37:56 bob Exp $ # $Id: translate_class.py,v 1.37 2017/05/16 11:48:02 bob Exp $
# #
# Author : Bob Rathbone # Author : Bob Rathbone
# Site : http://www.bobrathbone.com # Site : http://www.bobrathbone.com
@@ -17,10 +17,10 @@
# Useful Links on character encodings # Useful Links on character encodings
# http://www.zytrax.com/tech/web/entities.html # http://www.zytrax.com/tech/web/entities.html
# http://www.utf8-chartable.de/ # http://www.utf8-chartable.de/
# # http://www.codetable.net/
# http://www.ascii-code.com/
import os import os,sys
import time import time
import unicodedata import unicodedata
from log_class import Log from log_class import Log
@@ -35,11 +35,16 @@ class Translate:
codes = { codes = {
'//' : '/', # Double / '//' : '/', # Double /
' ' : ' ', # Double spaces ' ' : ' ', # Double spaces
'\\xa0' : ' ', # Line feed to space
'\\' : "'", # Double bacslash to apostrophe
'\\n' : ' ', # Line feed to space '\\n' : ' ', # Line feed to space
# German UTF8 codes
'\\xef\\xbf\\xbd' : chr(246),
# Currencies
'\\xe2\\x82\\xac' : ' Euro ',
# Special characters # Special characters
'\\x80\\x99' : "'", # Single quote
'\\xc2\\xa1' : '!', # Inverted exclamation '\\xc2\\xa1' : '!', # Inverted exclamation
'\\xc2\\xa2' : 'c', # Cent sign '\\xc2\\xa2' : 'c', # Cent sign
'\\xc2\\xa3' : '#', # Pound sign '\\xc2\\xa3' : '#', # Pound sign
@@ -82,21 +87,22 @@ class Translate:
'\\xc3\\x96' : chr(214), # O umlaut '\\xc3\\x96' : chr(214), # O umlaut
'\\xc3\\x9c' : chr(220), # U umlaut '\\xc3\\x9c' : chr(220), # U umlaut
# Norwegian unicode escape sequences # Scandinavian unicode escape sequences
'\\xc3\\x98' : 'O', # Oslash '\\xc2\\x88' : 'A', # aelig
'\\xc3\\xb8' : 'o', # Oslash '\\xc2\\xb4' : 'A', # aelig
'\\xc3\\x85' : 'A', # Aring '\\xc3\\x85' : 'Aa', # Aring
'\\xc3\\x93' : 'O', # O grave '\\xc3\\x93' : 'O', # O grave
'\\xc3\\xa4' : 'a', # a with double dot
'\\xc3\\xa5' : 'a', # aring '\\xc3\\xa5' : 'a', # aring
'\\xc3\\x86' : 'AE', # AElig '\\xc3\\x86' : 'AE', # AElig
'\\xc3\\x98' : 'O', # O crossed '\\xc3\\x98' : '0', # O crossed
'\\xc3\\x99' : 'U', # U grave '\\xc3\\x99' : 'U', # U grave
'\\xc3\\xa6' : 'ae', # aelig '\\xc3\\xa6' : 'ae', # aelig
'\\xc3\\xb0' : 'o', # o umlaut '\\xc3\\xb0' : 'o', # o umlaut
'\\xc3\\xb3' : 'o', # o tilde '\\xc3\\xb2' : 'o', # o tilde
'\\xc3\\xb3' : 'o', # o reverse tilde
'\\xc3\\xb4' : 'o', # Capital O circumflex
'\\xc3\\xb8' : 'o', # oslash '\\xc3\\xb8' : 'o', # oslash
'\\xc2\\x88' : 'A', # aelig
'\\xc2\\xb4' : 'A', # aelig
# French (Latin) unicode escape sequences # French (Latin) unicode escape sequences
'\\xc3\\x80' : 'A', # A grave '\\xc3\\x80' : 'A', # A grave
@@ -106,22 +112,26 @@ class Translate:
'\\xc3\\x88' : 'E', # E grave '\\xc3\\x88' : 'E', # E grave
'\\xc3\\x89' : 'E', # E acute '\\xc3\\x89' : 'E', # E acute
'\\xc3\\x8a' : 'E', # E circumflex '\\xc3\\x8a' : 'E', # E circumflex
'\\xc3\\xa0' : chr(224), # a grave '\\xc3\\xa0' : 'a', # a grave
'\\xc3\\xa1' : chr(225), # a acute '\\xc3\\xa1' : 'a', # a acute
'\\xc3\\xa2' : chr(226), # a circumflex '\\xc3\\xa2' : 'a', # a circumflex
'\\xc3\\xa8' : chr(232), # e grave '\\xc3\\xa7' : 'c', # c cedilla
'\\xc3\\xa9' : chr(233), # e acute '\\xc3\\xa8' : 'e', # e grave
'\\xc3\\xaa' : chr(234), # e circumflex '\\xc3\\xa9' : 'e', # e acute
'\\xc3\\xb6' : "'", # Hyphon '\\xc3\\xaa' : 'e', # e circumflex
'\\xc3\\xab' : 'e', # e diaeresis
'\\xc3\\xae' : 'i', # i circumflex
'\\xc3\\xaf' : 'i', # i diaeresis
'\\xc3\\xb7' : "/", # Division sign '\\xc3\\xb7' : "/", # Division sign
'\\xc5\\x93' : 'oe', # oe joined
# Hungarian lower case # Hungarian lower case
'\\xc3\\xb3' : chr(243), # '\\xc3\\xb3' : 'o', # o circumflex
'\\xc3\\xad' : chr(237), # '\\xc3\\xad' : 'i', # i accent
'\\xc3\\xb5' : chr(245), # '\\xc3\\xb5' : 'o', # o tilde
'\\xc5\\x91' : chr(245), # '\\xc5\\x91' : 'o', # o
'\\xc5\\xb1' : chr(252), # '\\xc5\\xb1' : chr(252), #
'\\xc3\\xba' : chr(250), # Ã '\\xc3\\xba' : 'u', # u acute
# Polish unicode escape sequences # Polish unicode escape sequences
'\\xc4\\x84' : 'A', # A, '\\xc4\\x84' : 'A', # A,
@@ -194,58 +204,116 @@ class Translate:
'\\xce\\xc8' : 'ps', # Psi '\\xce\\xc8' : 'ps', # Psi
'\\xce\\xc9' : 'o', # Omega '\\xce\\xc9' : 'o', # Omega
# Currency other special character # Icelandic
'\\xa3' : chr(156), # UK pound sign '\\xc3\\xbe' : 'p', # Like a p with up stroke
'\\xa9' : chr(169), # Copyright '\\xc3\\xbd' : 'y', # y diaeresis
# German short hex representation # Italian characters
'\\xdf' : chr(223), # Sharp s es-zett '\\xc3\\xac' : 'i', # i reverse circumflex
'\\xe4' : chr(228), # a umlaut '\\xc3\\xb9' : 'u', # u reverse circumflex
'\\xf6' : chr(246), # o umlaut
'\\xfc' : chr(252), # u umlaut
'\\xc4' : chr(196), # A umlaut
'\\xd6' : chr(214), # O umlaut
'\\xdc' : chr(220), # U umlaut
# Spanish and French # Polish (not previously covered)
'\\xe0' : chr(224), # Small a reverse acute '\\xc3\\xa3' : 'a', # a tilde
'\\xe1' : chr(225), # Small a acute
'\\xe2' : chr(226), # Small audo bashcircumflex
'\\xe7' : chr(231), # Small c Cedilla
'\\xe8' : chr(232), # Small e grave
'\\xe9' : chr(233), # Small e acute
'\\xea' : chr(234), # Small e circumflex
'\\xeb' : chr(235), # Small e diarisis
'\\xed' : chr(237), # Small i acute
'\\xee' : chr(238), # Small i circumflex
'\\xf1' : chr(241), # Small n tilde
'\\xf3' : chr(243), # Small o acute
'\\xf4' : chr(244), # Small o circumflex
'\\xf9' : chr(249), # Small u circumflex
'\\xfa' : chr(250), # Small u acute
'\\xfb' : chr(251), # u circumflex
'\\xc0' : chr(192), # Small A grave # Romanian
'\\xc1' : chr(193), # Capital A acute '\\xc4\\x83' : 'a', # a circumflex variant
'\\xc3\\xa2' : 'a', # a circumflex
'\\xc3\\xae' : 'i', # i circumflex
'\\xc5\\x9f' : 's', # s cedilla ?
'\\xc5\\xa3' : 's', # t cedilla ?
'\\xc8\\x99' : 's', # s with down stroke
'\\xc8\\x9b' : 't', # t with down stroke
'\\xc7' : chr(199), # Capital C Cedilla # Spanish not covered above
'\\xc9' : chr(201), # Capital E acute '\\xc3\\xb1' : 'n', # n tilde
'\\xcd' : chr(205), # Capital I acute
'\\xd3' : chr(211), # Capital O acute
'\\xda' : chr(218), # Capital U acute
'\\xbf' : chr(191), # Spanish Punctuation # Turkish not covered above
'\\xc3\\xbb' : 'u', # u circumflex
'xb0' : 'o', # Degres symbol '\\xc4\\x9f' : 'g', # g tilde
'\\xc4\\xb1' : 'i', # Looks like an i
'\\xc4\\xb0' : 'I', # Looks like an I
} }
# Single-byte (Latin-1) escape codes mapped to plain ASCII replacements.
# Keys are the escaped text form (backslash, 'x', two hex digits).
# Must be checked after the multi-byte codes above have been checked:
# keys such as '\\xc3' also occur as the first byte of those longer
# sequences, so replacing them first would corrupt the longer matches.
short_codes = {
    '\\xa0' : ' ',   # Non-breaking space to space
    '\\xb4' : "'",   # Acute accent to apostrophe
    '\\xc0' : 'A',   # Capital A grave
    '\\xc1' : 'A',   # Capital A acute
    '\\xc2' : 'A',   # Capital A circumflex
    '\\xc3' : 'A',   # Capital A tilde
    '\\xc4' : 'A',   # Capital A diaeresis (umlaut)
    '\\xc5' : 'A',   # Capital A ring above
    '\\xc6' : 'Ae',  # Capital AE joined
    '\\xc7' : 'C',   # Capital C cedilla
    '\\xc8' : 'E',   # Capital E grave
    '\\xc9' : 'E',   # Capital E acute
    '\\xca' : 'E',   # Capital E circumflex
    '\\xcb' : 'E',   # Capital E diaeresis
    '\\xcc' : 'I',   # Capital I grave
    '\\xcd' : 'I',   # Capital I acute
    '\\xce' : 'I',   # Capital I circumflex
    '\\xcf' : 'I',   # Capital I diaeresis
    '\\xd0' : 'D',   # Capital Eth
    '\\xd1' : 'N',   # Capital N tilde
    '\\xd2' : 'O',   # Capital O grave
    '\\xd3' : 'O',   # Capital O acute
    '\\xd4' : 'O',   # Capital O circumflex
    '\\xd5' : 'O',   # Capital O tilde
    '\\xd6' : 'O',   # Capital O diaeresis (umlaut)
    '\\xd7' : 'x',   # Multiply sign
    '\\xd8' : '0',   # Capital O crossed (shown as a zero)
    '\\xd9' : 'U',   # Capital U grave
    '\\xda' : 'U',   # Capital U acute
    '\\xdb' : 'U',   # Capital U circumflex
    '\\xdc' : 'U',   # Capital U diaeresis (umlaut)
    '\\xdd' : 'Y',   # Capital Y acute
    '\\xdf' : 'S',   # Sharp s es-zett
    '\\xe0' : 'a',   # Small a grave (was wrongly mapped to 'e')
    '\\xe1' : 'a',   # Small a acute
    '\\xe2' : 'a',   # Small a circumflex
    '\\xe3' : 'a',   # Small a tilde
    '\\xe4' : 'a',   # Small a diaeresis (umlaut)
    '\\xe5' : 'aa',  # Small a ring above
    '\\xe6' : 'ae',  # Small ae joined
    '\\xe7' : 'c',   # Small c cedilla
    '\\xe8' : 'e',   # Small e grave
    '\\xe9' : 'e',   # Small e acute
    '\\xea' : 'e',   # Small e circumflex
    '\\xeb' : 'e',   # Small e diaeresis
    '\\xed' : 'i',   # Small i acute
    '\\xee' : 'i',   # Small i circumflex
    '\\xf1' : 'n',   # Small n tilde
    '\\xf3' : 'o',   # Small o acute
    '\\xf4' : 'o',   # Small o circumflex
    '\\xf6' : 'o',   # Small o diaeresis (umlaut)
    '\\xf7' : '/',   # Division sign
    '\\xf8' : 'oe',  # Small o strike through
    '\\xf9' : 'u',   # Small u grave
    '\\xfa' : 'u',   # Small u acute
    '\\xfb' : 'u',   # Small u circumflex
    '\\xfc' : 'u',   # Small u diaeresis (umlaut)
    '\\xbf' : '?',   # Inverted question mark (Spanish punctuation)
    '\\xb0' : 'o',   # Degrees symbol
}
# HTML codes (RSS feeds)
HtmlCodes = { HtmlCodes = {
# Currency # Currency
chr(156) : '#', # Pound by hash chr(156) : '#', # Pound by hash
chr(169) : '(c)', # Copyright chr(169) : '(c)', # Copyright
# Norwegian # Norwegian
chr(216) : 'O', # Oslash chr(216) : '0', # Oslash
# Spanish french # Spanish french
chr(241) : 'n', # Small tilde n chr(241) : 'n', # Small tilde n
@@ -278,7 +346,8 @@ class Translate:
chr(196) : "Ae", # A umlaut chr(196) : "Ae", # A umlaut
chr(214) : "Oe", # O umlaut chr(214) : "Oe", # O umlaut
chr(220) : "Ue", # U umlaut chr(220) : "Ue", # U umlaut
} }
unicodes = { unicodes = {
'\\u201e' : '"', # ORF feed '\\u201e' : '"', # ORF feed
@@ -288,34 +357,34 @@ class Translate:
'\\u0153' : "oe", # French oe '\\u0153' : "oe", # French oe
'\\u2009' : ' ', # Short space to space '\\u2009' : ' ', # Short space to space
'\\u2013' : '-', # Long dash to minus sign '\\u2013' : '-', # Long dash to minus sign
'\\u2019' : "'", # French apostrophe '\\u2018' : "'", # Left single quote
'\\u2019' : "'", # Right single quote
# Polish unicodes (I don't know why, but works :) ) (Pecus) # Polish unicodes (I don't know why, but works :) ) (Pecus)
"'u0104" : "A", # A, (Pecus) '\\u0104' : "A", # A, (Pecus)
"'u0105" : "a", # a, (Pecus) '\\u0105' : "a", # a, (Pecus)
"'u0106" : "C", # C' (Pecus) '\\u0106' : "C", # C' (Pecus)
"'u0107" : "c", # c' (Pecus) '\\u0107' : "c", # c' (Pecus)
"'u0118" : "E", # E, (Pecus) '\\u0118' : "E", # E, (Pecus)
"'u0119" : "e", # e, (Pecus) '\\u0119' : "e", # e, (Pecus)
"'u0141" : "L", # L/ (Pecus) '\\u0141' : "L", # L/ (Pecus)
"'u0142" : "l", # l/ (Pecus) '\\u0142' : "l", # l/ (Pecus)
"'u0143" : "N", # N' (Pecus) '\\u0143' : "N", # N' (Pecus)
"'u0144" : "n", # n' (Pecus) '\\u0144' : "n", # n' (Pecus)
"'xd3" : "O", # O' (Pecus) #"'xd3" : "O", # O' (Pecus)
"'xf3" : "o", # o' (Pecus) #"'xf3" : "o", # o' (Pecus)
"'u015a" : "S", # S' (Pecus) '\\u015a' : "S", # S' (Pecus)
"'u015b" : "s", # s' (Pecus) '\\u015b' : "s", # s' (Pecus)
"'u0179" : "Z", # Z' (Pecus) '\\u0179' : "Z", # Z' (Pecus)
"'u017a" : "z", # z' (Pecus) '\\u017a' : "z", # z' (Pecus)
"'u017b" : "Z", # Z. (Pecus) '\\u017b' : "Z", # Z. (Pecus)
"'u017c" : "z", # z. (Pecus) '\\u017c' : "z", # z. (Pecus)
} }
def __init__(self): def __init__(self):
log.init('radio') log.init('radio')
return return
# Translate all # Translate all (Called by rss class)
def all(self,text): def all(self,text):
s = self._convert2escape(text) s = self._convert2escape(text)
s = self._escape(s) s = self._escape(s)
@@ -327,21 +396,29 @@ class Translate:
def _convert2escape(self,text): def _convert2escape(self,text):
s = repr(text) s = repr(text)
if s.__len__() > 2: if s.__len__() > 2:
s= s[1:-1] # Strip ' characters #s= s[1:-1] # Strip ' characters
s = s.lstrip("'") s = s.lstrip('\'')
s = s.rstrip('\'')
return s return s
# Convert escaped characters (umlauts) to normal characters # Convert escaped characters (umlauts) to normal characters
def escape(self,text): def escape(self,text):
s = self._convert2escape(text) s = self._convert2escape(text)
s = self._escape(s) s = self._escape(s)
s = s.lstrip('"')
s = s.rstrip('"')
return s return s
# Convert escaped characters (umlauts etc.) to normal characters # Convert escaped characters (umlauts etc.) to normal characters
def _escape(self,text): def _escape(self,text):
s = text s = text
for code in self.codes: for code in self.codes:
s = s.replace(code, self.codes[code]) s = s.replace(code, self.codes[code])
for code in self.short_codes:
s = s.replace(code, self.short_codes[code])
s = s.replace("'oC",'oC') # Degrees C fudge s = s.replace("'oC",'oC') # Degrees C fudge
s = s.replace("'oF",'oF') # Degrees C fudge s = s.replace("'oF",'oF') # Degrees C fudge
return s return s
@@ -419,3 +496,22 @@ class Translate:
return s return s
# End of class # End of class
# Test translate class
if __name__ == '__main__':
    # Manual check of the Translate class: translate the first command
    # line argument, or a built-in sample of Scandinavian characters
    # when no argument is given.
    translate = Translate()
    text = sys.argv[1] if len(sys.argv) > 1 else 'æ Æ ø Ø å Å'

    # Show the raw input followed by its escaped (repr based) form
    print(text)
    escaped = translate._convert2escape(text)
    print(escaped)

    # Complete text after all translation stages
    print(translate.all(text))

    # Trailing blank line, then exit with success status
    print('')
    sys.exit(0)
# End of file