| # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) | |
| import stringprep, re, codecs | |
| from unicodedata import ucd_3_2_0 as unicodedata | |
| # IDNA section 3.1 | |
| dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]") | |
| # IDNA section 5 | |
| ace_prefix = "xn--" | |
| uace_prefix = unicode(ace_prefix, "ascii") | |
| # This assumes query strings, so AllowUnassigned is true | |
| def nameprep(label): | |
| # Map | |
| newlabel = [] | |
| for c in label: | |
| if stringprep.in_table_b1(c): | |
| # Map to nothing | |
| continue | |
| newlabel.append(stringprep.map_table_b2(c)) | |
| label = u"".join(newlabel) | |
| # Normalize | |
| label = unicodedata.normalize("NFKC", label) | |
| # Prohibit | |
| for c in label: | |
| if stringprep.in_table_c12(c) or \ | |
| stringprep.in_table_c22(c) or \ | |
| stringprep.in_table_c3(c) or \ | |
| stringprep.in_table_c4(c) or \ | |
| stringprep.in_table_c5(c) or \ | |
| stringprep.in_table_c6(c) or \ | |
| stringprep.in_table_c7(c) or \ | |
| stringprep.in_table_c8(c) or \ | |
| stringprep.in_table_c9(c): | |
| raise UnicodeError("Invalid character %r" % c) | |
| # Check bidi | |
| RandAL = map(stringprep.in_table_d1, label) | |
| for c in RandAL: | |
| if c: | |
| # There is a RandAL char in the string. Must perform further | |
| # tests: | |
| # 1) The characters in section 5.8 MUST be prohibited. | |
| # This is table C.8, which was already checked | |
| # 2) If a string contains any RandALCat character, the string | |
| # MUST NOT contain any LCat character. | |
| if filter(stringprep.in_table_d2, label): | |
| raise UnicodeError("Violation of BIDI requirement 2") | |
| # 3) If a string contains any RandALCat character, a | |
| # RandALCat character MUST be the first character of the | |
| # string, and a RandALCat character MUST be the last | |
| # character of the string. | |
| if not RandAL[0] or not RandAL[-1]: | |
| raise UnicodeError("Violation of BIDI requirement 3") | |
| return label | |
| def ToASCII(label): | |
| try: | |
| # Step 1: try ASCII | |
| label = label.encode("ascii") | |
| except UnicodeError: | |
| pass | |
| else: | |
| # Skip to step 3: UseSTD3ASCIIRules is false, so | |
| # Skip to step 8. | |
| if 0 < len(label) < 64: | |
| return label | |
| raise UnicodeError("label empty or too long") | |
| # Step 2: nameprep | |
| label = nameprep(label) | |
| # Step 3: UseSTD3ASCIIRules is false | |
| # Step 4: try ASCII | |
| try: | |
| label = label.encode("ascii") | |
| except UnicodeError: | |
| pass | |
| else: | |
| # Skip to step 8. | |
| if 0 < len(label) < 64: | |
| return label | |
| raise UnicodeError("label empty or too long") | |
| # Step 5: Check ACE prefix | |
| if label.startswith(uace_prefix): | |
| raise UnicodeError("Label starts with ACE prefix") | |
| # Step 6: Encode with PUNYCODE | |
| label = label.encode("punycode") | |
| # Step 7: Prepend ACE prefix | |
| label = ace_prefix + label | |
| # Step 8: Check size | |
| if 0 < len(label) < 64: | |
| return label | |
| raise UnicodeError("label empty or too long") | |
| def ToUnicode(label): | |
| # Step 1: Check for ASCII | |
| if isinstance(label, str): | |
| pure_ascii = True | |
| else: | |
| try: | |
| label = label.encode("ascii") | |
| pure_ascii = True | |
| except UnicodeError: | |
| pure_ascii = False | |
| if not pure_ascii: | |
| # Step 2: Perform nameprep | |
| label = nameprep(label) | |
| # It doesn't say this, but apparently, it should be ASCII now | |
| try: | |
| label = label.encode("ascii") | |
| except UnicodeError: | |
| raise UnicodeError("Invalid character in IDN label") | |
| # Step 3: Check for ACE prefix | |
| if not label.startswith(ace_prefix): | |
| return unicode(label, "ascii") | |
| # Step 4: Remove ACE prefix | |
| label1 = label[len(ace_prefix):] | |
| # Step 5: Decode using PUNYCODE | |
| result = label1.decode("punycode") | |
| # Step 6: Apply ToASCII | |
| label2 = ToASCII(result) | |
| # Step 7: Compare the result of step 6 with the one of step 3 | |
| # label2 will already be in lower case. | |
| if label.lower() != label2: | |
| raise UnicodeError("IDNA does not round-trip", label, label2) | |
| # Step 8: return the result of step 5 | |
| return result | |
| ### Codec APIs | |
| class Codec(codecs.Codec): | |
| def encode(self,input,errors='strict'): | |
| if errors != 'strict': | |
| # IDNA is quite clear that implementations must be strict | |
| raise UnicodeError("unsupported error handling "+errors) | |
| if not input: | |
| return "", 0 | |
| result = [] | |
| labels = dots.split(input) | |
| if labels and len(labels[-1])==0: | |
| trailing_dot = '.' | |
| del labels[-1] | |
| else: | |
| trailing_dot = '' | |
| for label in labels: | |
| result.append(ToASCII(label)) | |
| # Join with U+002E | |
| return ".".join(result)+trailing_dot, len(input) | |
| def decode(self,input,errors='strict'): | |
| if errors != 'strict': | |
| raise UnicodeError("Unsupported error handling "+errors) | |
| if not input: | |
| return u"", 0 | |
| # IDNA allows decoding to operate on Unicode strings, too. | |
| if isinstance(input, unicode): | |
| labels = dots.split(input) | |
| else: | |
| # Must be ASCII string | |
| input = str(input) | |
| unicode(input, "ascii") | |
| labels = input.split(".") | |
| if labels and len(labels[-1]) == 0: | |
| trailing_dot = u'.' | |
| del labels[-1] | |
| else: | |
| trailing_dot = u'' | |
| result = [] | |
| for label in labels: | |
| result.append(ToUnicode(label)) | |
| return u".".join(result)+trailing_dot, len(input) | |
| class IncrementalEncoder(codecs.BufferedIncrementalEncoder): | |
| def _buffer_encode(self, input, errors, final): | |
| if errors != 'strict': | |
| # IDNA is quite clear that implementations must be strict | |
| raise UnicodeError("unsupported error handling "+errors) | |
| if not input: | |
| return ("", 0) | |
| labels = dots.split(input) | |
| trailing_dot = u'' | |
| if labels: | |
| if not labels[-1]: | |
| trailing_dot = '.' | |
| del labels[-1] | |
| elif not final: | |
| # Keep potentially unfinished label until the next call | |
| del labels[-1] | |
| if labels: | |
| trailing_dot = '.' | |
| result = [] | |
| size = 0 | |
| for label in labels: | |
| result.append(ToASCII(label)) | |
| if size: | |
| size += 1 | |
| size += len(label) | |
| # Join with U+002E | |
| result = ".".join(result) + trailing_dot | |
| size += len(trailing_dot) | |
| return (result, size) | |
| class IncrementalDecoder(codecs.BufferedIncrementalDecoder): | |
| def _buffer_decode(self, input, errors, final): | |
| if errors != 'strict': | |
| raise UnicodeError("Unsupported error handling "+errors) | |
| if not input: | |
| return (u"", 0) | |
| # IDNA allows decoding to operate on Unicode strings, too. | |
| if isinstance(input, unicode): | |
| labels = dots.split(input) | |
| else: | |
| # Must be ASCII string | |
| input = str(input) | |
| unicode(input, "ascii") | |
| labels = input.split(".") | |
| trailing_dot = u'' | |
| if labels: | |
| if not labels[-1]: | |
| trailing_dot = u'.' | |
| del labels[-1] | |
| elif not final: | |
| # Keep potentially unfinished label until the next call | |
| del labels[-1] | |
| if labels: | |
| trailing_dot = u'.' | |
| result = [] | |
| size = 0 | |
| for label in labels: | |
| result.append(ToUnicode(label)) | |
| if size: | |
| size += 1 | |
| size += len(label) | |
| result = u".".join(result) + trailing_dot | |
| size += len(trailing_dot) | |
| return (result, size) | |
| class StreamWriter(Codec,codecs.StreamWriter): | |
| pass | |
| class StreamReader(Codec,codecs.StreamReader): | |
| pass | |
| ### encodings module API | |
| def getregentry(): | |
| return codecs.CodecInfo( | |
| name='idna', | |
| encode=Codec().encode, | |
| decode=Codec().decode, | |
| incrementalencoder=IncrementalEncoder, | |
| incrementaldecoder=IncrementalDecoder, | |
| streamwriter=StreamWriter, | |
| streamreader=StreamReader, | |
| ) |