Commit 686e55db authored by Elliott Sales de Andrade, committed by Philipp Zumstein
Browse files

py3k: Remove raw Unicode strings.

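Raw Unicode string literals (the `ur"..."` prefix) aren't supported in
Python 3 and, in any case, aren't necessary here. None of the affected
strings actually contains any non-ASCII characters, and because of Python 2's
lax str/unicode split they don't really need to be Unicode strings either.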
parent def9e601
......@@ -8,7 +8,7 @@ import re
digits = u"0123456789"
letters = u"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
symbols = ur"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
symbols = u"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
ascii = digits+letters+symbols
xsymbols = u"""€¢£»«›‹÷©®†‡°∙•◦‣¶§÷¡¿▪▫"""
......@@ -60,7 +60,7 @@ replacements = [
def requote(s):
s = unicode(s)
s = re.sub(ur"''",u'"',s)
s = re.sub(r"''",u'"',s)
return s
def requote_fancy(s,germanic=0):
......@@ -68,20 +68,20 @@ def requote_fancy(s,germanic=0):
if germanic:
# germanic quoting style reverses the shapes
# straight double quotes
s = re.sub(ur"\s+''",u"”",s)
s = re.sub(u"''\s+",u"“",s)
s = re.sub(ur"\s+,,",u"„",s)
s = re.sub(r"\s+''",u"”",s)
s = re.sub(r"''\s+",u"“",s)
s = re.sub(r"\s+,,",u"„",s)
# straight single quotes
s = re.sub(ur"\s+'",u"’",s)
s = re.sub(ur"'\s+",u"‘",s)
s = re.sub(ur"\s+,",u"‚",s)
s = re.sub(r"\s+'",u"’",s)
s = re.sub(r"'\s+",u"‘",s)
s = re.sub(r"\s+,",u"‚",s)
else:
# straight double quotes
s = re.sub(ur"\s+''",u"“",s)
s = re.sub(ur"''\s+",u"”",s)
s = re.sub(ur"\s+,,",u"„",s)
s = re.sub(r"\s+''",u"“",s)
s = re.sub(r"''\s+",u"”",s)
s = re.sub(r"\s+,,",u"„",s)
# straight single quotes
s = re.sub(ur"\s+'",u"‘",s)
s = re.sub(ur"'\s+",u"’",s)
s = re.sub(ur"\s+,",u"‚",s)
s = re.sub(r"\s+'",u"‘",s)
s = re.sub(r"'\s+",u"’",s)
s = re.sub(r"\s+,",u"‚",s)
return s
......@@ -110,10 +110,10 @@ def normalize_text(s):
characters."""
s = unicode(s)
s = unicodedata.normalize('NFC',s)
s = re.sub(ur'\s+(?u)',' ',s)
s = re.sub(ur'\n(?u)','',s)
s = re.sub(ur'^\s+(?u)','',s)
s = re.sub(ur'\s+$(?u)','',s)
s = re.sub(r'\s+(?u)',' ',s)
s = re.sub(r'\n(?u)','',s)
s = re.sub(r'^\s+(?u)','',s)
s = re.sub(r'\s+$(?u)','',s)
for m,r in replacements:
s = re.sub(unicode(m),unicode(r),s)
return s
......@@ -122,23 +122,23 @@ def project_text(s,kind="exact"):
"""Project text onto a smaller subset of characters
for comparison."""
s = normalize_text(s)
s = re.sub(ur'( *[.] *){4,}',u'....',s) # dot rows
s = re.sub(ur'[~_]',u'',s) # dot rows
s = re.sub(r'( *[.] *){4,}',u'....',s) # dot rows
s = re.sub(r'[~_]',u'',s) # dot rows
if kind=="exact":
return s
if kind=="nospace":
return re.sub(ur'\s','',s)
return re.sub(r'\s','',s)
if kind=="spletdig":
return re.sub(ur'[^A-Za-z0-9 ]','',s)
return re.sub(r'[^A-Za-z0-9 ]','',s)
if kind=="letdig":
return re.sub(ur'[^A-Za-z0-9]','',s)
return re.sub(r'[^A-Za-z0-9]','',s)
if kind=="letters":
return re.sub(ur'[^A-Za-z]','',s)
return re.sub(r'[^A-Za-z]','',s)
if kind=="digits":
return re.sub(ur'[^0-9]','',s)
return re.sub(r'[^0-9]','',s)
if kind=="lnc":
s = s.upper()
return re.sub(ur'[^A-Z]','',s)
return re.sub(r'[^A-Z]','',s)
raise BadInput("unknown normalization: "+kind)
################################################################
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment