# This module implements RFC 3490 (IDNA) and RFC 3491 (Nameprep).
import stringprep, re, codecs | |
from unicodedata import ucd_3_2_0 as unicodedata | |
# IDNA section 3.1: the characters recognised as label separators are
# U+002E (full stop), U+3002 (ideographic full stop), U+FF0E (fullwidth
# full stop) and U+FF61 (halfwidth ideographic full stop).
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, as a byte string and as a unicode string.
# Both literals must always spell the same prefix.
ace_prefix = "xn--"
uace_prefix = u"xn--"
# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) of stringprep to *label*.

    The label is mapped (table B.1 characters are dropped, the rest go
    through table B.2), normalized with NFKC, and then checked against the
    prohibited-character tables and the bidirectional-text requirements.

    Returns the prepared unicode label.

    Raises UnicodeError if the prepared label contains a prohibited
    character or violates one of the bidi requirements.
    """
    # Map: characters of table B.1 are mapped to nothing, everything else
    # goes through the table B.2 mapping.
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi.  The requirements concern the label as a whole, so they
    # are evaluated once rather than once per RandALCat character (which
    # made this step quadratic for long right-to-left labels).
    RandAL = [stringprep.in_table_d1(c) for c in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
def ToASCII(label):
    """Convert a single label to its ASCII form (IDNA ToASCII operation).

    UseSTD3ASCIIRules is treated as false.  A label that is already ASCII
    is returned encoded as-is; otherwise it is run through nameprep() and,
    if still not ASCII, punycode-encoded and prefixed with the ACE prefix.

    Raises UnicodeError if the resulting label is empty or longer than 63
    characters, if nameprep() rejects the label, or if the prepared
    non-ASCII label already starts with the ACE prefix.
    """
    def checked(candidate):
        # Step 8: Check size
        if 0 < len(candidate) < 64:
            return candidate
        raise UnicodeError("label empty or too long")

    # Step 1: try ASCII
    try:
        encoded = label.encode("ascii")
    except UnicodeError:
        encoded = None
    if encoded is not None:
        # Step 3 does nothing (UseSTD3ASCIIRules is false), so go
        # straight to step 8.
        return checked(encoded)

    # Step 2: nameprep
    prepared = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        encoded = prepared.encode("ascii")
    except UnicodeError:
        encoded = None
    if encoded is not None:
        # Skip to step 8.
        return checked(encoded)

    # Step 5: Check ACE prefix
    if prepared.startswith(uace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    return checked(ace_prefix + prepared.encode("punycode"))
def ToUnicode(label):
    """Convert a single label to unicode (IDNA ToUnicode operation).

    Labels without the ACE prefix are returned as unicode unchanged (after
    nameprep() for non-ASCII unicode input).  ACE labels are
    punycode-decoded and verified by re-encoding with ToASCII().

    Raises UnicodeError if the label is not ASCII even after nameprep(),
    or if the decoded label does not round-trip through ToASCII().
    """
    # Step 1: Check for ASCII
    if not isinstance(label, str):
        try:
            ascii_label = label.encode("ascii")
        except UnicodeError:
            ascii_label = None

        if ascii_label is None:
            # Step 2: Perform nameprep
            prepared = nameprep(label)
            # It doesn't say this, but apparently, it should be ASCII now
            try:
                ascii_label = prepared.encode("ascii")
            except UnicodeError:
                raise UnicodeError("Invalid character in IDN label")

        label = ascii_label

    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    # Step 5: Decode using PUNYCODE
    decoded = label[len(ace_prefix):].decode("punycode")

    # Step 6: Apply ToASCII
    reencoded = ToASCII(decoded)

    # Step 7: Compare the result of step 6 with the one of step 3
    # reencoded will already be in lower case.
    if label.lower() != reencoded:
        raise UnicodeError("IDNA does not round-trip", label, reencoded)

    # Step 8: return the result of step 5
    return decoded
### Codec APIs
class Codec(codecs.Codec):
    """Stateless IDNA codec operating on whole dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode every label of *input* with ToASCII().

        Returns a tuple (encoded name, number of input characters
        consumed).  A trailing separator is preserved as a trailing '.'.
        Only errors='strict' is accepted; anything else raises
        UnicodeError.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        trailing_dot = ''
        if labels and not labels[-1]:
            # An empty last label means the name ended with a separator.
            trailing_dot = '.'
            labels.pop()

        # Join with U+002E
        encoded = ".".join([ToASCII(label) for label in labels])
        return encoded+trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode every label of *input* with ToUnicode().

        Returns a tuple (decoded unicode name, length of the input).
        Only errors='strict' is accepted; anything else raises
        UnicodeError.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Called only for its side effect: raises if input is not ASCII.
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels and not labels[-1]:
            trailing_dot = u'.'
            labels.pop()

        decoded = u".".join([ToUnicode(label) for label in labels])
        return decoded+trailing_dot, len(input)
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels found in *input*.

        Unless *final* is true, a last label that is not followed by a
        separator is left unconsumed so that it can be completed by a later
        call.

        Returns a tuple (encoded byte string, number of input characters
        consumed).  Only errors='strict' is accepted; anything else raises
        UnicodeError.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return ("", 0)

        labels = dots.split(input)
        # This must be a byte string like the rest of the encoded output:
        # starting from u'' would coerce the whole result to unicode
        # whenever no trailing dot is emitted.
        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        # Join with U+002E
        result = ".".join([ToASCII(label) for label in labels]) + trailing_dot

        # Consumed input: the labels, one separator between each pair of
        # adjacent labels, and the trailing separator if there is one.
        size = sum([len(label) for label in labels]) + len(trailing_dot)
        if labels:
            size += len(labels) - 1
        return (result, size)
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels found in *input*.

        Unless *final* is true, a last label that is not followed by a
        separator is left unconsumed so that it can be completed by a later
        call.

        Returns a tuple (decoded unicode string, number of input characters
        consumed).  Only errors='strict' is accepted; anything else raises
        UnicodeError.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return (u"", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Called only for its side effect: raises if input is not ASCII.
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels:
            if not labels[-1]:
                trailing_dot = u'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = u'.'

        result = u".".join([ToUnicode(label) for label in labels]) + trailing_dot

        # Consumed input: the labels, one separator between each pair of
        # adjacent labels, and the trailing separator if there is one.
        # Separators are counted from the number of labels, not from the
        # running size, because empty labels are accepted here and a
        # running size of zero would make their separators go uncounted,
        # leaving them in the buffer to be decoded a second time.
        size = sum([len(label) for label in labels]) + len(trailing_dot)
        if labels:
            size += len(labels) - 1
        return (result, size)
class StreamWriter(Codec, codecs.StreamWriter):
    """IDNA stream writer; combines Codec with codecs.StreamWriter."""
class StreamReader(Codec, codecs.StreamReader):
    """IDNA stream reader; combines Codec with codecs.StreamReader."""
### encodings module API
def getregentry():
    """Return the codecs.CodecInfo entry describing the 'idna' codec."""
    # Separate Codec instances back the two stateless entry points.
    encoder = Codec().encode
    decoder = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=encoder,
        decode=decoder,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )