codegen/genHtml5LibTests.py - third_party/libxml2 - Git at Google

 #!/usr/bin/env python3

 import glob
 import json
 import re

 # Number written to the test header for each tokenizer initial state
 # that may appear in a test's 'initialStates' list.
 state_map = {
     'Data state':          0,
     'RCDATA state':        1,
     'RAWTEXT state':       2,
     'PLAINTEXT state':     3,
     'Script data state':   4,
     'CDATA section state': 5,
 }

 # A literal backslash-u escape: '\u' followed by four hex digits.
 _ESCAPE_RE = re.compile(r'\\u([A-Fa-f0-9]{4})')

 # A literal backslash-u escape whose code starts with D8..DF (surrogates).
 _SURROGATE_RE = re.compile(r'\\uD[89A-F]', re.I)


 def _decode_escapes(text):
     """Return *text* with each literal backslash-u escape replaced by the
     character whose code point it spells out."""
     return _ESCAPE_RE.sub(lambda m: chr(int(m[1], 16)), text)


 def _render_tokens(tokens):
     """Return the expected-result text for the token list of one test.

     Every token contributes a line holding its type (element 0). A
     DOCTYPE token is followed by one line for each of its elements 1 to
     3, with ``<none>`` standing in for ``None``. Any other token is
     followed by a single line holding element 1; for a StartTag, each
     attribute of the dict in element 2 is appended as `` name=value``.
     Tokens whose element 1 is a lone U+0000 character are left out.
     """
     parts = []
     for token in tokens:
         if token[1] == '\0':
             continue

         kind = token[0]
         parts.append(kind + '\n')

         if kind == 'DOCTYPE':
             for i in range(1, 4):
                 field = token[i]
                 parts.append('<none>\n' if field is None else field + '\n')
         else:
             parts.append(token[1])
             if kind == 'StartTag':
                 parts.extend(f' {name}={value}'
                              for name, value in token[2].items())
             parts.append('\n')

     output = _decode_escapes(''.join(parts))

     # The HTML5 spec splits handling of U+0000 across
     # tokenizer and tree builder. We already ignore
     # U+0000 in body text when tokenizing.
     return output.replace('\x00', '')


 def _open_output(path):
     """Open *path* for writing as UTF-8 without newline translation.

     The test header records the UTF-8 byte length of the input, so the
     bytes on disk must not depend on the locale or the platform.
     """
     return open(path, 'w', encoding='utf-8', newline='\n')


 def generate_tests(pattern='../html5lib-tests/tokenizer/*.test',
                    test_dir='test/html-tokenizer',
                    result_dir='result/html-tokenizer'):
     """Convert html5lib tokenizer test files into test and result files.

     Every JSON file matching *pattern* is processed in sorted order,
     except the one named ``xmlViolation``. For each test and each of its
     initial states (``'Data state'`` when none are listed) one case is
     written:

     * to ``<test_dir>/<name>.test``: a header line with the case number,
       the test's ``lastStartTag`` (``-`` if absent), the state number
       and the UTF-8 byte length of the input, then the input and a
       newline;
     * to ``<result_dir>/<name>.test``: the case number on a line, then
       the rendered expected tokens.

     Tests whose input contains an escaped surrogate are skipped, and no
     case is written for the CDATA section state. Case numbers start at
     0 for each file. Both output directories must already exist.

     Raises:
         Exception: if a test lists an initial state that is not a key
             of ``state_map``.
     """
     for filename in sorted(glob.glob(pattern)):
         match = re.search(r'/([^/]*)\.test$', filename)
         if match is None:
             continue
         testname = match[1]
         if testname == 'xmlViolation':
             continue

         with open(filename, encoding='utf-8') as json_data:
             root = json.load(json_data)

         test_path = f'{test_dir}/{testname}.test'
         result_path = f'{result_dir}/{testname}.test'

         # Both files stay open for every group of tests in the JSON
         # document and are closed exactly once, even on error.
         with _open_output(test_path) as test_out, \
                 _open_output(result_path) as result_out:
             counter = 0

             for tests in root.values():
                 for test in tests:
                     text = test['input']

                     # Skip surrogate tests
                     if _SURROGATE_RE.search(text):
                         continue

                     text = _decode_escapes(text)
                     output = _render_tokens(test['output'])
                     start_tag = test.get('lastStartTag', '-')

                     for state in test.get('initialStates', ['Data state']):
                         state_no = state_map.get(state)
                         if state_no is None:
                             raise Exception(
                                 f'{filename}: unknown state: {state}')
                         # 5 is 'CDATA section state'; no case is
                         # generated for it.
                         if state_no == 5:
                             continue

                         test_out.write(f'{counter} {start_tag} {state_no} '
                                        f'{len(text.encode())}\n')
                         test_out.write(text)
                         test_out.write('\n')

                         result_out.write(f'{counter}\n')
                         result_out.write(output)

                         counter += 1


 # Generate from the default locations, relative to the current directory.
 generate_tests()
	#!/usr/bin/env python3

	import glob
	import json
	import re

	# Number written to the test header for each tokenizer initial state
	# that may appear in a test's 'initialStates' list.
	state_map = {
	    'Data state':          0,
	    'RCDATA state':        1,
	    'RAWTEXT state':       2,
	    'PLAINTEXT state':     3,
	    'Script data state':   4,
	    'CDATA section state': 5,
	}

	# A literal backslash-u escape: '\u' followed by four hex digits.
	_ESCAPE_RE = re.compile(r'\\u([A-Fa-f0-9]{4})')

	# A literal backslash-u escape whose code starts with D8..DF (surrogates).
	_SURROGATE_RE = re.compile(r'\\uD[89A-F]', re.I)


	def _decode_escapes(text):
	    """Return *text* with each literal backslash-u escape replaced by the
	    character whose code point it spells out."""
	    return _ESCAPE_RE.sub(lambda m: chr(int(m[1], 16)), text)


	def _render_tokens(tokens):
	    """Return the expected-result text for the token list of one test.

	    Every token contributes a line holding its type (element 0). A
	    DOCTYPE token is followed by one line for each of its elements 1 to
	    3, with ``<none>`` standing in for ``None``. Any other token is
	    followed by a single line holding element 1; for a StartTag, each
	    attribute of the dict in element 2 is appended as `` name=value``.
	    Tokens whose element 1 is a lone U+0000 character are left out.
	    """
	    parts = []
	    for token in tokens:
	        if token[1] == '\0':
	            continue

	        kind = token[0]
	        parts.append(kind + '\n')

	        if kind == 'DOCTYPE':
	            for i in range(1, 4):
	                field = token[i]
	                parts.append('<none>\n' if field is None else field + '\n')
	        else:
	            parts.append(token[1])
	            if kind == 'StartTag':
	                parts.extend(f' {name}={value}'
	                             for name, value in token[2].items())
	            parts.append('\n')

	    output = _decode_escapes(''.join(parts))

	    # The HTML5 spec splits handling of U+0000 across
	    # tokenizer and tree builder. We already ignore
	    # U+0000 in body text when tokenizing.
	    return output.replace('\x00', '')


	def _open_output(path):
	    """Open *path* for writing as UTF-8 without newline translation.

	    The test header records the UTF-8 byte length of the input, so the
	    bytes on disk must not depend on the locale or the platform.
	    """
	    return open(path, 'w', encoding='utf-8', newline='\n')


	def generate_tests(pattern='../html5lib-tests/tokenizer/*.test',
	                   test_dir='test/html-tokenizer',
	                   result_dir='result/html-tokenizer'):
	    """Convert html5lib tokenizer test files into test and result files.

	    Every JSON file matching *pattern* is processed in sorted order,
	    except the one named ``xmlViolation``. For each test and each of its
	    initial states (``'Data state'`` when none are listed) one case is
	    written:

	    * to ``<test_dir>/<name>.test``: a header line with the case number,
	      the test's ``lastStartTag`` (``-`` if absent), the state number
	      and the UTF-8 byte length of the input, then the input and a
	      newline;
	    * to ``<result_dir>/<name>.test``: the case number on a line, then
	      the rendered expected tokens.

	    Tests whose input contains an escaped surrogate are skipped, and no
	    case is written for the CDATA section state. Case numbers start at
	    0 for each file. Both output directories must already exist.

	    Raises:
	        Exception: if a test lists an initial state that is not a key
	            of ``state_map``.
	    """
	    for filename in sorted(glob.glob(pattern)):
	        match = re.search(r'/([^/]*)\.test$', filename)
	        if match is None:
	            continue
	        testname = match[1]
	        if testname == 'xmlViolation':
	            continue

	        with open(filename, encoding='utf-8') as json_data:
	            root = json.load(json_data)

	        test_path = f'{test_dir}/{testname}.test'
	        result_path = f'{result_dir}/{testname}.test'

	        # Both files stay open for every group of tests in the JSON
	        # document and are closed exactly once, even on error.
	        with _open_output(test_path) as test_out, \
	                _open_output(result_path) as result_out:
	            counter = 0

	            for tests in root.values():
	                for test in tests:
	                    text = test['input']

	                    # Skip surrogate tests
	                    if _SURROGATE_RE.search(text):
	                        continue

	                    text = _decode_escapes(text)
	                    output = _render_tokens(test['output'])
	                    start_tag = test.get('lastStartTag', '-')

	                    for state in test.get('initialStates', ['Data state']):
	                        state_no = state_map.get(state)
	                        if state_no is None:
	                            raise Exception(
	                                f'{filename}: unknown state: {state}')
	                        # 5 is 'CDATA section state'; no case is
	                        # generated for it.
	                        if state_no == 5:
	                            continue

	                        test_out.write(f'{counter} {start_tag} {state_no} '
	                                       f'{len(text.encode())}\n')
	                        test_out.write(text)
	                        test_out.write('\n')

	                        result_out.write(f'{counter}\n')
	                        result_out.write(output)

	                        counter += 1


	# Generate from the default locations, relative to the current directory.
	generate_tests()