# Sorting non-ASCII text with the Unicode Collation Algorithm,
# using the third-party pyuca package (pure-Python UCA implementation).
import pyuca

coll = pyuca.Collator()
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=coll.sort_key)
# expected order: ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']
import unicodedata import re
re_digit = re.compile(r'\d')
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'
for char in sample: print('U+%04x' % ord(char), # <1> char.center(6), # <2> 're_dig' if re_digit.match(char) else '-', # <3> 'isdig' if char.isdigit() else '-', # <4> 'isnum' if char.isnumeric() else '-', # <5> format(unicodedata.numeric(char), '5.2f'), # <6> unicodedata.name(char), # <7> sep='\t')
import re
re_numbers_str = re.compile(r'\d+') # <1> re_words_str = re.compile(r'\w+') re_numbers_bytes = re.compile(rb'\d+') # <2> re_words_bytes = re.compile(rb'\w+')
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef" # <3> " as 1729 = 1³ + 12³ = 9³ + 10³.") # <4>
text_bytes = text_str.encode('utf_8') # <5>
print('Text', repr(text_str), sep='\n ') print('Numbers') print(' str :', re_numbers_str.findall(text_str)) # <6> print(' bytes:', re_numbers_bytes.findall(text_bytes)) # <7> print('Words') print(' str :', re_words_str.findall(text_str)) # <8> print(' bytes:', re_words_bytes.findall(text_bytes)) # <9>
################################
""" Radical folding and text sanitizing.
Handling a string with cp1252
symbols:
>>> order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
>>> shave_marks(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> shave_marks_latin(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> dewinize(order)
'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'
>>> asciize(order)
'"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'
Handling a string with Greek and Latin accented characters:
>>> greek = 'Ζέφυρος, Zéfiro'
>>> shave_marks(greek)
'Ζεφυρος, Zefiro'
>>> shave_marks_latin(greek)
'Ζέφυρος, Zefiro'
>>> dewinize(greek)
'Ζέφυρος, Zéfiro'
>>> asciize(greek)
'Ζέφυρος, Zefiro'
"""
import unicodedata import string
def shave_marks(txt):
    """Remove all diacritic marks from *txt*.

    Decomposes to NFD so each accent becomes a separate combining
    character, drops every combining character (regardless of the
    script of its base character), then recomposes to NFC.
    """
    decomposed = unicodedata.normalize('NFD', txt)  # <1>
    base_only = ''.join(ch for ch in decomposed
                        if not unicodedata.combining(ch))  # <2>
    return unicodedata.normalize('NFC', base_only)  # <3>
def shave_marks_latin(txt):
    """Remove diacritic marks, but only from Latin base characters.

    Unlike shave_marks(), accents on non-Latin bases (e.g. Greek)
    are preserved.
    """
    decomposed = unicodedata.normalize('NFD', txt)  # <1>
    prev_is_latin = False
    kept = []
    for ch in decomposed:
        if unicodedata.combining(ch) and prev_is_latin:  # <2>
            continue  # drop a diacritic attached to a Latin base char
        kept.append(ch)  # <3>
        # any non-combining char starts a new base character
        if not unicodedata.combining(ch):  # <4>
            prev_is_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))  # <5>
# Translation tables for dewinize(): map cp1252 "gremlin" characters
# to ASCII (or ASCII-friendly) replacements in a single pass.
single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""",  # <1> one-to-one swaps
                           """'f"*^<''""---~>""")

multi_map = str.maketrans({  # <2> one char -> multi-char expansions
    '‡': '**',
    '‰': '<per mille>',
    '…': '...',
    '€': '<euro>',
    'Œ': 'OE',
    'œ': 'oe',
    '™': '(TM)',
})

multi_map.update(single_map)  # <3> merged table covers both kinds


def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences"""
    return txt.translate(multi_map)  # <4>
def asciize(txt):
    """Aggressively fold *txt* toward ASCII.

    Applies dewinize(), strips diacritics from Latin characters,
    replaces the German sharp s (which is not a base-plus-mark
    combination, so shaving cannot handle it), and finishes with
    NFKC to compose characters from their compatibility forms.
    """
    cleaned = dewinize(txt)
    cleaned = shave_marks_latin(cleaned)  # <5>
    cleaned = cleaned.replace('ß', 'ss')  # <6>
    return unicodedata.normalize('NFKC', cleaned)  # <7>