genUnicode.py - third_party/github.com/GNOME/libxml2 - Git at Google

 #!/usr/bin/python -u
 import sys
 import string
 import time

 sources = "Blocks-4.txt UnicodeData-3.1.0.txt"

 try:
     blocks = open("Blocks-4.txt", "r")
 except:
     print "Missing Blocks-4.txt, aborting ..."
     sys.exit(1)

 BlockNames = {}
 for line in blocks.readlines():
     if line[0] == '#':
         continue
     line = string.strip(line)
     if line == '':
 	continue
     try:
 	fields = string.split(line, ';')
 	range = string.strip(fields[0])
 	(start, end) = string.split(range, "..")
 	name = string.strip(fields[1])
 	name = string.replace(name, ' ', '')
     except:
         print "Failed to process line: %s" % (line)
 	continue
     BlockNames[name] = ("0x"+start, "0x"+end)
 blocks.close()
 print "Parsed %d blocks descriptions" % (len(BlockNames.keys()))

 try:
     data = open("UnicodeData-3.1.0.txt", "r")
 except:
     print "Missing UnicodeData-3.1.0.txt, aborting ..."
     sys.exit(1)

 nbchar = 0;
 Categories = {}
 for line in data.readlines():
     if line[0] == '#':
         continue
     line = string.strip(line)
     if line == '':
 	continue
     try:
 	fields = string.split(line, ';')
 	point = string.strip(fields[0])
 	value = 0
 	while point != '':
 	    value = value * 16
 	    if point[0] >= '0' and point[0] <= '9':
 	        value = value + ord(point[0]) - ord('0')
 	    elif point[0] >= 'A' and point[0] <= 'F':
 	        value = value + 10 + ord(point[0]) - ord('A')
 	    elif point[0] >= 'a' and point[0] <= 'f':
 	        value = value + 10 + ord(point[0]) - ord('a')
 	    point = point[1:]
 	name = fields[2]
     except:
         print "Failed to process line: %s" % (line)
 	continue

     nbchar = nbchar + 1
     try:
 	Categories[name].append(value)
     except:
         try:
 	    Categories[name] = [value]
 	except:
 	    print "Failed to process line: %s" % (line)
     try:
 	Categories[name[0]].append(value)
     except:
         try:
 	    Categories[name[0]] = [value]
 	except:
 	    print "Failed to process line: %s" % (line)

 blocks.close()
 print "Parsed %d char generating %d categories" % (nbchar, len(Categories.keys()))
 #reduce the number list into ranges
 for cat in Categories.keys():
     list = Categories[cat]
     start = -1
     prev = -1
     end = -1
     ranges = []
     for val in list:
         if start == -1:
 	    start = val
 	    prev = val
 	    continue
 	elif val == prev + 1:
 	    prev = val
 	    continue
 	elif prev == start:
 	    ranges.append((prev, prev))
 	    start = val
 	    prev = val
 	    continue
 	else:
 	    ranges.append((start, prev))
 	    start = val
 	    prev = val
 	    continue
     if prev == start:
         ranges.append((prev, prev))
     else:
         ranges.append((start, prev))
     Categories[cat] = ranges

 #
 # Generate the resulting files
 #
 try:
     header = open("xmlunicode.h", "w")
 except:
     print "Failed to open xmlunicode.h"
     sys.exit(1)

 try:
     output = open("xmlunicode.c", "w")
 except:
     print "Failed to open xmlunicode.c"
     sys.exit(1)

 date = time.asctime(time.localtime(time.time()))

 header.write(
 """/*
  * xmlunicode.h: this header exports interfaces for the Unicode character APIs
  *
  * This file is automatically generated from the
  * UCS description files of the Unicode Character Database
  * http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
  * using the genUnicode.py Python script.
  *
  * Generation date: %s
  * Sources: %s
  * Daniel Veillard <veillard@redhat.com>
  */

 #ifndef __XML_UNICODE_H__
 #define __XML_UNICODE_H__

 #ifdef __cplusplus
 extern "C" {
 #endif

 """ % (date, sources));
 output.write(
 """/*
  * xmlunicode.c: this module implements the Unicode character APIs
  *
  * This file is automatically generated from the
  * UCS description files of the Unicode Character Database
  * http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
  * using the genUnicode.py Python script.
  *
  * Generation date: %s
  * Sources: %s
  * Daniel Veillard <veillard@redhat.com>
  */

 #define IN_LIBXML
 #include "libxml.h"

 #ifdef LIBXML_UNICODE_ENABLED

 #include <string.h>
 #include <libxml/xmlversion.h>
 #include <libxml/xmlunicode.h>

 """ % (date, sources));

 keys = BlockNames.keys()
 keys.sort()
 for block in keys:
     (start, end) = BlockNames[block]
     name = string.replace(block, '-', '')
     header.write("int\txmlUCSIs%s\t(int code);\n" % name)
     output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
     output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
                  (block))
     output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
     output.write("int\nxmlUCSIs%s(int code) {\n" % name)
     output.write("    return((code >= %s) && (code <= %s));\n" % (start, end))
     output.write("}\n\n")

 header.write("\nint\txmlUCSIsBlock\t(int code,\n\t\t\t const char *block);\n\n")
 output.write("/**\n * xmlUCSIsBlock:\n * @code: UCS code point\n")
 output.write(" * @block: UCS block name\n")
 output.write(" *\n * Check whether the caracter is part of the UCS Block\n")
 output.write(" *\n * Returns 1 if true, 0 if false and -1 on unknown block\n */\n");
 output.write("int\nxmlUCSIsBlock(int code, const char *block) {\n")
 keys = BlockNames.keys()
 keys.sort()
 for block in keys:
     name = string.replace(block, '-', '')
     output.write("    if (!strcmp(block, \"%s\"))\n        return(xmlUCSIs%s(code));\n" %
                  (block, name));
 output.write("    return(-1);\n}\n\n")


 keys = Categories.keys()
 keys.sort()
 for name in keys:
     ranges = Categories[name]
     header.write("int\txmlUCSIsCat%s\t(int code);\n" % name)
     output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
     output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
                  (name))
     output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
     output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
     start = 1
     for range in ranges:
         (begin, end) = range;
 	if start:
 	    output.write("    return(");
 	    start = 0
 	else:
 	    output.write(" ||\n           ");
 	if (begin == end):
 	    output.write("(code == %s)" % (hex(begin)))
 	else:
 	    output.write("((code >= %s) && (code <= %s))" % (
 	                 hex(begin), hex(end)))
     output.write(");\n}\n\n")

 header.write("\nint\txmlUCSIsCat\t(int code,\n\t\t\t const char *cat);\n")
 output.write("/**\n * xmlUCSIsCat:\n * @code: UCS code point\n")
 output.write(" * @cat: UCS Category name\n")
 output.write(" *\n * Check whether the caracter is part of the UCS Category\n")
 output.write(" *\n * Returns 1 if true, 0 if false and -1 on unknown category\n */\n");
 output.write("int\nxmlUCSIsCat(int code, const char *cat) {\n")
 keys = Categories.keys()
 keys.sort()
 for name in keys:
     output.write("    if (!strcmp(cat, \"%s\"))\n        return(xmlUCSIsCat%s(code));\n" %
                  (name, name));
 output.write("    return(-1);\n}\n\n")

 header.write("""
 #ifdef __cplusplus
 }
 #endif
 #endif /* __XML_UNICODE_H__ */
 """);
 output.write("""
 #endif /* LIBXML_UNICODE_ENABLED */
 """);
 header.close()
 output.close()
	#!/usr/bin/python -u
	import sys
	import string
	import time

	sources = "Blocks-4.txt UnicodeData-3.1.0.txt"

	try:
	blocks = open("Blocks-4.txt", "r")
	except:
	print "Missing Blocks-4.txt, aborting ..."
	sys.exit(1)

	BlockNames = {}
	for line in blocks.readlines():
	if line[0] == '#':
	continue
	line = string.strip(line)
	if line == '':
	continue
	try:
	fields = string.split(line, ';')
	range = string.strip(fields[0])
	(start, end) = string.split(range, "..")
	name = string.strip(fields[1])
	name = string.replace(name, ' ', '')
	except:
	print "Failed to process line: %s" % (line)
	continue
	BlockNames[name] = ("0x"+start, "0x"+end)
	blocks.close()
	print "Parsed %d blocks descriptions" % (len(BlockNames.keys()))

	try:
	data = open("UnicodeData-3.1.0.txt", "r")
	except:
	print "Missing UnicodeData-3.1.0.txt, aborting ..."
	sys.exit(1)

	nbchar = 0;
	Categories = {}
	for line in data.readlines():
	if line[0] == '#':
	continue
	line = string.strip(line)
	if line == '':
	continue
	try:
	fields = string.split(line, ';')
	point = string.strip(fields[0])
	value = 0
	while point != '':
	value = value * 16
	if point[0] >= '0' and point[0] <= '9':
	value = value + ord(point[0]) - ord('0')
	elif point[0] >= 'A' and point[0] <= 'F':
	value = value + 10 + ord(point[0]) - ord('A')
	elif point[0] >= 'a' and point[0] <= 'f':
	value = value + 10 + ord(point[0]) - ord('a')
	point = point[1:]
	name = fields[2]
	except:
	print "Failed to process line: %s" % (line)
	continue

	nbchar = nbchar + 1
	try:
	Categories[name].append(value)
	except:
	try:
	Categories[name] = [value]
	except:
	print "Failed to process line: %s" % (line)
	try:
	Categories[name[0]].append(value)
	except:
	try:
	Categories[name[0]] = [value]
	except:
	print "Failed to process line: %s" % (line)

	blocks.close()
	print "Parsed %d char generating %d categories" % (nbchar, len(Categories.keys()))
	#reduce the number list into ranges
	for cat in Categories.keys():
	list = Categories[cat]
	start = -1
	prev = -1
	end = -1
	ranges = []
	for val in list:
	if start == -1:
	start = val
	prev = val
	continue
	elif val == prev + 1:
	prev = val
	continue
	elif prev == start:
	ranges.append((prev, prev))
	start = val
	prev = val
	continue
	else:
	ranges.append((start, prev))
	start = val
	prev = val
	continue
	if prev == start:
	ranges.append((prev, prev))
	else:
	ranges.append((start, prev))
	Categories[cat] = ranges

	#
	# Generate the resulting files
	#
	try:
	header = open("xmlunicode.h", "w")
	except:
	print "Failed to open xmlunicode.h"
	sys.exit(1)

	try:
	output = open("xmlunicode.c", "w")
	except:
	print "Failed to open xmlunicode.c"
	sys.exit(1)

	date = time.asctime(time.localtime(time.time()))

	header.write(
	"""/*
	* xmlunicode.h: this header exports interfaces for the Unicode character APIs
	*
	* This file is automatically generated from the
	* UCS description files of the Unicode Character Database
	* http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
	* using the genUnicode.py Python script.
	*
	* Generation date: %s
	* Sources: %s
	* Daniel Veillard <veillard@redhat.com>
	*/

	#ifndef __XML_UNICODE_H__
	#define __XML_UNICODE_H__

	#ifdef __cplusplus
	extern "C" {
	#endif

	""" % (date, sources));
	output.write(
	"""/*
	* xmlunicode.c: this module implements the Unicode character APIs
	*
	* This file is automatically generated from the
	* UCS description files of the Unicode Character Database
	* http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
	* using the genUnicode.py Python script.
	*
	* Generation date: %s
	* Sources: %s
	* Daniel Veillard <veillard@redhat.com>
	*/

	#define IN_LIBXML
	#include "libxml.h"

	#ifdef LIBXML_UNICODE_ENABLED

	#include <string.h>
	#include <libxml/xmlversion.h>
	#include <libxml/xmlunicode.h>

	""" % (date, sources));

	keys = BlockNames.keys()
	keys.sort()
	for block in keys:
	(start, end) = BlockNames[block]
	name = string.replace(block, '-', '')
	header.write("int\txmlUCSIs%s\t(int code);\n" % name)
	output.write("/*\n xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
	output.write(" \n Check whether the character is part of %s UCS Block\n"%
	(block))
	output.write(" \n Returns 1 if true 0 otherwise\n */\n");
	output.write("int\nxmlUCSIs%s(int code) {\n" % name)
	output.write(" return((code >= %s) && (code <= %s));\n" % (start, end))
	output.write("}\n\n")

	header.write("\nint\txmlUCSIsBlock\t(int code,\n\t\t\t const char *block);\n\n")
	output.write("/*\n xmlUCSIsBlock:\n * @code: UCS code point\n")
	output.write(" * @block: UCS block name\n")
	output.write(" \n Check whether the caracter is part of the UCS Block\n")
	output.write(" \n Returns 1 if true, 0 if false and -1 on unknown block\n */\n");
	output.write("int\nxmlUCSIsBlock(int code, const char *block) {\n")
	keys = BlockNames.keys()
	keys.sort()
	for block in keys:
	name = string.replace(block, '-', '')
	output.write(" if (!strcmp(block, \"%s\"))\n return(xmlUCSIs%s(code));\n" %
	(block, name));
	output.write(" return(-1);\n}\n\n")


	keys = Categories.keys()
	keys.sort()
	for name in keys:
	ranges = Categories[name]
	header.write("int\txmlUCSIsCat%s\t(int code);\n" % name)
	output.write("/*\n xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
	output.write(" \n Check whether the character is part of %s UCS Category\n"%
	(name))
	output.write(" \n Returns 1 if true 0 otherwise\n */\n");
	output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
	start = 1
	for range in ranges:
	(begin, end) = range;
	if start:
	output.write(" return(");
	start = 0
	else:
	output.write(" \|\|\n ");
	if (begin == end):
	output.write("(code == %s)" % (hex(begin)))
	else:
	output.write("((code >= %s) && (code <= %s))" % (
	hex(begin), hex(end)))
	output.write(");\n}\n\n")

	header.write("\nint\txmlUCSIsCat\t(int code,\n\t\t\t const char *cat);\n")
	output.write("/*\n xmlUCSIsCat:\n * @code: UCS code point\n")
	output.write(" * @cat: UCS Category name\n")
	output.write(" \n Check whether the caracter is part of the UCS Category\n")
	output.write(" \n Returns 1 if true, 0 if false and -1 on unknown category\n */\n");
	output.write("int\nxmlUCSIsCat(int code, const char *cat) {\n")
	keys = Categories.keys()
	keys.sort()
	for name in keys:
	output.write(" if (!strcmp(cat, \"%s\"))\n return(xmlUCSIsCat%s(code));\n" %
	(name, name));
	output.write(" return(-1);\n}\n\n")

	header.write("""
	#ifdef __cplusplus
	}
	#endif
	#endif /* __XML_UNICODE_H__ */
	""");
	output.write("""
	#endif /* LIBXML_UNICODE_ENABLED */
	""");
	header.close()
	output.close()