# blob: ff08fb4c87dab39be94b1e613df9b97e04060aff [file] [edit]
import sys
def info(msg, *args):
    """Write a diagnostic message to standard error.

    *msg* is a printf-style format string; it is interpolated with *args*
    using the ``%`` operator before being printed.
    """
    text = msg % args
    print(text, file=sys.stderr)
def fail(msg, *args):
    """Report a fatal error and terminate the script.

    The message is formatted and printed by info(); the interpreter is then
    asked to exit with status 2.
    """
    info(msg, *args)
    raise SystemExit(2)
# ASCII characters that may occur in a well-formed XML document (tab, LF, CR
# and the printable range 0x20-0x7E), excluding the characters "$@\^`{}~".
_xml_whitespace = set(b'\t\n\r')
_printable_ascii = set(range(32, 127))
_excluded_punctuation = set(br'$@\^`{}~')
compulsory_chars = (_xml_whitespace | _printable_ascii) - _excluded_punctuation
def check_compatibility(encoding):
    """Return True if *encoding* agrees with ASCII on every compulsory byte.

    Each value in ``compulsory_chars`` is decoded as a single byte with
    *encoding*.  The first byte that cannot be decoded, or that decodes to
    something other than the character with the same code, is reported
    through info() and the function returns False.
    """
    for code in compulsory_chars:
        expected = chr(code)
        try:
            actual = bytes((code,)).decode(encoding)
        except UnicodeDecodeError:
            info('Cannot decode byte %#04x (%a)', code, expected)
            return False
        if actual == expected:
            continue
        info('Incompatible encoding: %#04x (%a) -> %a', code, expected, actual)
        return False
    return True
def create_table(encoding):
    """Build the 256-entry lead-byte table for *encoding*.

    Every code point from U+0000 to U+10FFFF is encoded with *encoding*
    (code points the codec rejects are skipped) and the result is indexed
    by its first byte.  Each entry of the returned list is one of:

    * ``-1`` -- no encoded character starts with this byte;
    * a non-negative code point -- the byte on its own encodes that
      character (if several characters share the byte, the lowest code
      point is kept and the clash is only reported through info());
    * ``-n`` with ``n`` in 2..4 -- the byte starts an n-byte sequence.

    The first encodable non-BMP character is reported through info().
    Conflicts that cannot be represented in the table -- a byte that is both
    a complete character and the start of a longer sequence, a lead byte
    used with different sequence lengths, or an encoding longer than four
    bytes -- are reported through fail().
    """
    mapping = [-1] * 256
    non_bmp = None
    for i in range(0x110000):
        char = chr(i)
        try:
            encoded = char.encode(encoding)
        except UnicodeEncodeError:
            continue
        if i >= 0x10000 and non_bmp is None:
            # Only the first non-BMP character is worth mentioning.
            non_bmp = i
            info('Non-BMP character: %r (U+%04X)', char, i)
        length = len(encoded)
        k = encoded[0]
        v = mapping[k]
        if length == 1:
            if v == -1:
                mapping[k] = i
            elif v < 0:
                # The byte is already registered as the lead byte of a
                # multi-byte sequence of length -v.
                fail('Ambiguous mapping for %#04x: '
                     '%r (U+%04X) and %d-byte sequence',
                     k, char, i, -v)
            else:
                # Two characters encode to the same single byte; keep the
                # first (lowest) one and just warn.
                info('Ambiguous mapping for %#04x: '
                     '%r (U+%04X) and %r (U+%04X)',
                     k, chr(v), v, char, i)
        else:
            if length > 4:
                fail('Too long encoding for %r (U+%04X): %r',
                     char, i, encoded)
            if v == -1:
                mapping[k] = -length
            elif v != -length:
                if v < 0:
                    fail('Ambiguous mapping for %#04x: '
                         '%d-byte sequence and %d-byte sequence %r',
                         k, -v, length, encoded)
                else:
                    fail('Ambiguous mapping for %#04x: '
                         '%r (U+%04X) and %d-byte sequence %r',
                         k, chr(v), v, length, encoded)
    return mapping
def print_table(mapping):
    """Print *mapping* as an ``_expat_decoding_table=(...)`` source fragment.

    *mapping* is a sequence with at least 256 integer entries.  If its first
    128 entries are exactly ``list(range(128))`` they are abbreviated as
    ``*range(128),`` on the header line and only entries 128..255 are listed.
    Non-negative entries are written in hexadecimal, negative ones in
    decimal.

    Entries are handled in groups of eight.  Each output line is indented by
    12 spaces and holds as many whole groups as fit in 68 characters; a
    single group longer than that gets a line of its own.  The last line is
    closed with ``),``.
    """
    print(' '*8 + '_expat_decoding_table=(', end='')
    if mapping[:128] == list(range(128)):
        print('*range(128),')
        start = 128
    else:
        print()
        start = 0
    indent = ' '*12
    line = ''
    # Length of `line` at the previous group boundary, i.e. the part that can
    # be flushed without splitting a group of eight entries.
    pos = 0
    for i in range(start, 256):
        if i % 8 == 0:
            # Flush only when there is a complete earlier group to emit;
            # with pos == 0 the flush would print a whitespace-only line.
            if pos and len(line) > 68:
                print(indent + line[:pos].rstrip())
                line = line[pos:]
            pos = len(line)
        v = mapping[i]
        line += f'{hex(v) if v >= 0 else v}, '
    # Final group boundary (index 256): flush once more, then close the tuple.
    if pos and len(line) > 68:
        print(indent + line[:pos].rstrip())
        line = line[pos:]
    print(indent + line.rstrip().rstrip(',') + '),')
# Script body: the encoding name is taken from the first command-line
# argument; any further arguments are ignored.
if len(sys.argv) < 2:
    # Report a missing argument as a usage error instead of letting
    # sys.argv[1] raise a bare IndexError.
    fail('Usage: %s ENCODING', sys.argv[0])
encoding = sys.argv[1]
if not check_compatibility(encoding):
    # The encoding disagrees with ASCII on a compulsory character: print a
    # False table entry and exit with status 1.
    print(' '*8 + '_expat_decoding_table=False,')
    sys.exit(1)
mapping = create_table(encoding)
print_table(mapping)