Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions Tools/unicode/gen_expat_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import sys


def info(msg, *args):
print(msg % args, file=sys.stderr)

def fail(msg, *args):
info(msg, *args)
sys.exit(2)

# ASCII characters that can appear in a well-formed XML document
# except the characters "$@\^`{}~".
compulsory_chars = (set(b'\t\n\r') | set(range(32, 127))) - set(br'$@\^`{}~')

def check_compatibility(encoding):
for i in compulsory_chars:
c = chr(i)
b = bytes([i])
try:
decoded = b.decode(encoding)
except UnicodeDecodeError:
info('Cannot decode byte %#04x (%a)', i, c)
return False
if decoded != c:
info('Incompatible encoding: %#04x (%a) -> %a', i, c, decoded)
return False
return True

def create_table(encoding):
mapping = [-1] * 256

non_bmp = None
for i in range(0x110000):
char = chr(i)
try:
encoded = char.encode(encoding)
except UnicodeEncodeError:
continue
if i >= 0x10000 and non_bmp is None:
non_bmp = i
info('Non-BMP character: %r (U+%04X)', char, i)
length = len(encoded)
k = encoded[0]
v = mapping[k]
if length == 1:
if v == -1:
mapping[k] = i
elif v < 0:
fail('Ambiguous mapping for %#04x: '
'%r (U+%04X) and %d-byte sequence',
k, c, i, -v)
else:
info('Ambiguous mapping for %#04x: '
'%r (U+%04X) and %r (U+%04X)',
k, chr(v), v, char, i)
else:
if length > 4:
fail('Too long encoding for %r (U+%04X): %r',
char, i, encoded)
if v == -1:
mapping[k] = -length
elif v != -length:
if v < 0:
fail('Ambiguous mapping for %#04x: '
'%d-byte sequence and %d-byte sequence %r',
k, -v, length, encoded)
else:
fail('Ambiguous mapping for %#04x: '
'%r (U+%04X) and %d-byte sequence %r',
k, chr(v), v, length, encoded)
return mapping

def print_table(mapping):
print(' '*8 + '_expat_decoding_table=(', end='')
if mapping[:128] == list(range(128)):
print('*range(128),')
start = 128
else:
print()
start = 0
line = ''
pos = 0
i = start
while True:
if i % 8 == 0:
if len(line) > 68:
print(' '*12 + line[:pos].rstrip())
line = line[pos:]
pos = len(line)
if i == 256:
break
v = mapping[i]
if v >= 0:
v = hex(v)
line += f'{v}, '
i += 1
print(' '*12 + line.rstrip().rstrip(',') + '),')


encoding = sys.argv[1]

if not check_compatibility(encoding):
print(' '*8 + '_expat_decoding_table=False,')
sys.exit(1)

mapping = create_table(encoding)
print_table(mapping)
Loading