python 文本和字节序列

pyuca对Unicode排序

# Locale-independent Unicode sorting with pyuca (pure-Python implementation
# of the Unicode Collation Algorithm).  NOTE: third-party package `pyuca`.
import pyuca

coll = pyuca.Collator()
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=coll.sort_key)
# sorted_fruits == ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']

正则对于特殊字符的匹配

BEGIN NUMERICS_DEMO

# Compare three ways of detecting "numeric" characters — the regex \d class,
# str.isdigit(), and str.isnumeric() — across a sample of Unicode code points.
import re
import unicodedata

re_digit = re.compile(r'\d')

# '1', ¼, ², Devanagari three, Ethiopic three, Roman numeral twelve,
# circled seven, parenthesized thirteen, circled ideograph ten.
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

for char in sample:
    print('U+%04x' % ord(char),                        # <1> code point
          char.center(6),                              # <2> the character itself
          're_dig' if re_digit.match(char) else '-',   # <3> matched by regex \d?
          'isdig' if char.isdigit() else '-',          # <4> str.isdigit()
          'isnum' if char.isnumeric() else '-',        # <5> str.isnumeric()
          format(unicodedata.numeric(char), '5.2f'),   # <6> numeric value
          unicodedata.name(char),                      # <7> official Unicode name
          sep='\t')

END NUMERICS_DEMO

特殊字符

BEGIN RE_DEMO

# Behavior of str vs bytes regular expressions: str patterns match Unicode
# digits/word chars, bytes patterns match ASCII only.
import re

re_numbers_str = re.compile(r'\d+')      # <1> str pattern: Unicode-aware \d
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')   # <2> bytes pattern: ASCII-only \d
re_words_bytes = re.compile(rb'\w+')

# Tamil digits for 1729, followed by the ASCII spelling of the same number.
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"  # <3>
            " as 1729 = 1³ + 12³ = 9³ + 10³.")        # <4>

text_bytes = text_str.encode('utf_8')  # <5> bytes needed for the bytes regexes

print('Text', repr(text_str), sep='\n ')
print('Numbers')
print(' str :', re_numbers_str.findall(text_str))      # <6> finds Tamil + ASCII digits
print(' bytes:', re_numbers_bytes.findall(text_bytes))  # <7> ASCII digits only
print('Words')
print(' str :', re_words_str.findall(text_str))        # <8>
print(' bytes:', re_words_bytes.findall(text_bytes))   # <9>

END RE_DEMO

################################

""" Radical folding and text sanitizing.

Handling a string with cp1252 symbols:

>>> order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
>>> shave_marks(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> shave_marks_latin(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> dewinize(order)
'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'
>>> asciize(order)
'"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'

Handling a string with Greek and Latin accented characters:

>>> greek = 'Ζέφυρος, Zéfiro'
>>> shave_marks(greek)
'Ζεφυρος, Zefiro'
>>> shave_marks_latin(greek)
'Ζέφυρος, Zefiro'
>>> dewinize(greek)
'Ζέφυρος, Zéfiro'
>>> asciize(greek)
'Ζέφυρος, Zefiro'

"""

BEGIN SHAVE_MARKS

import string
import unicodedata

def shave_marks(txt):
    """Remove all diacritic marks from *txt* and return the result."""
    # <1> Decompose each character into base char + combining marks (NFD).
    norm_txt = unicodedata.normalize('NFD', txt)
    # <2> Drop every combining mark, keeping only base characters.
    shaved = ''.join(c for c in norm_txt
                     if not unicodedata.combining(c))
    # <3> Recompose the remaining characters back to canonical form.
    return unicodedata.normalize('NFC', shaved)

END SHAVE_MARKS

BEGIN SHAVE_MARKS_LATIN

def shave_marks_latin(txt):
    """Remove diacritic marks, but only from Latin base characters."""
    norm_txt = unicodedata.normalize('NFD', txt)  # <1> decompose base + marks
    latin_base = False
    keepers = []
    for c in norm_txt:
        if unicodedata.combining(c) and latin_base:  # <2>
            continue  # ignore diacritic on Latin base char
        keepers.append(c)  # <3> keep base chars and non-Latin diacritics
        # if it isn't a combining char, it's a new base char
        if not unicodedata.combining(c):  # <4>
            latin_base = c in string.ascii_letters
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)  # <5> recompose

END SHAVE_MARKS_LATIN

BEGIN ASCIIZE

# <1> One-to-one replacements: each cp1252 symbol maps to a single ASCII char.
single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""",
                           """'f"*^<''""---~>""")

# <2> One-to-many replacements: cp1252 symbols that need ASCII *strings*.
multi_map = str.maketrans({
    '€': '<euro>',
    '…': '...',
    'Œ': 'OE',
    '™': '(TM)',
    'œ': 'oe',
    '‰': '<per mille>',
    '‡': '**',
})

multi_map.update(single_map)  # <3> merge both tables into one


def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences."""
    return txt.translate(multi_map)  # <4> single C-level pass over txt

def asciize(txt):
    """Best-effort transliteration of *txt* toward plain ASCII."""
    no_marks = shave_marks_latin(dewinize(txt))  # <5> swap symbols, strip Latin accents
    no_marks = no_marks.replace('ß', 'ss')       # <6> sharp s has no combining mark to shave
    # <7> NFKC composes characters using their compatibility equivalents.
    return unicodedata.normalize('NFKC', no_marks)

END ASCIIZE

本文参与腾讯云自媒体分享计划,欢迎正在阅读的你也加入,一起分享。

发表于

我来说两句

0 条评论
登录 后参与评论

扫码关注云+社区

领取腾讯云代金券