mypyc/irbuild/format_str_tokenizer.py - third_party/github.com/python/mypy - Git at Google

 """Tokenizers for three string formatting methods"""

 from __future__ import annotations

 from enum import Enum, unique
 from typing import Final

 from mypy.checkstrformat import (
     ConversionSpecifier,
     parse_conversion_specifiers,
     parse_format_value,
 )
 from mypy.errors import Errors
 from mypy.messages import MessageBuilder
 from mypy.nodes import Context, Expression
 from mypy.options import Options
 from mypyc.ir.ops import Integer, Value
 from mypyc.ir.rtypes import (
     c_pyssize_t_rprimitive,
     is_bytes_rprimitive,
     is_int_rprimitive,
     is_short_int_rprimitive,
     is_str_rprimitive,
 )
 from mypyc.irbuild.builder import IRBuilder
 from mypyc.primitives.bytes_ops import bytes_build_op
 from mypyc.primitives.int_ops import int_to_str_op
 from mypyc.primitives.str_ops import str_build_op, str_op


 @unique
 class FormatOp(Enum):
     """FormatOp represents conversion operations of string formatting during
     compile time.

     Compare to ConversionSpecifier, FormatOp has fewer attributes.
     For example, to mark a conversion from any object to string,
     ConversionSpecifier may have several representations, like '%s', '{}'
     or '{:{}}'. However, there would only exist one corresponding FormatOp.
     """

     STR = "s"
     INT = "d"
     BYTES = "b"


 def generate_format_ops(specifiers: list[ConversionSpecifier]) -> list[FormatOp] | None:
     """Convert ConversionSpecifier to FormatOp.

     Different ConversionSpecifiers may share a same FormatOp.
     """
     format_ops = []
     for spec in specifiers:
         # TODO: Match specifiers instead of using whole_seq
         if spec.whole_seq == "%s" or spec.whole_seq == "{:{}}":
             format_op = FormatOp.STR
         elif spec.whole_seq == "%d":
             format_op = FormatOp.INT
         elif spec.whole_seq == "%b":
             format_op = FormatOp.BYTES
         elif spec.whole_seq:
             return None
         else:
             format_op = FormatOp.STR
         format_ops.append(format_op)
     return format_ops


 def tokenizer_printf_style(format_str: str) -> tuple[list[str], list[FormatOp]] | None:
     """Tokenize a printf-style format string using regex.

     Return:
         A list of string literals and a list of FormatOps.
     """
     literals: list[str] = []
     specifiers: list[ConversionSpecifier] = parse_conversion_specifiers(format_str)
     format_ops = generate_format_ops(specifiers)
     if format_ops is None:
         return None

     last_end = 0
     for spec in specifiers:
         cur_start = spec.start_pos
         literals.append(format_str[last_end:cur_start])
         last_end = cur_start + len(spec.whole_seq)
     literals.append(format_str[last_end:])

     return literals, format_ops


 # The empty Context as an argument for parse_format_value().
 # It wouldn't be used since the code has passed the type-checking.
 EMPTY_CONTEXT: Final = Context()


 def tokenizer_format_call(format_str: str) -> tuple[list[str], list[FormatOp]] | None:
     """Tokenize a str.format() format string.

     The core function parse_format_value() is shared with mypy.
     With these specifiers, we then parse the literal substrings
     of the original format string and convert `ConversionSpecifier`
     to `FormatOp`.

     Return:
         A list of string literals and a list of FormatOps. The literals
         are interleaved with FormatOps and the length of returned literals
         should be exactly one more than FormatOps.
         Return None if it cannot parse the string.
     """
     # Creates an empty MessageBuilder here.
     # It wouldn't be used since the code has passed the type-checking.
     specifiers = parse_format_value(
         format_str, EMPTY_CONTEXT, MessageBuilder(Errors(Options()), {})
     )
     if specifiers is None:
         return None
     format_ops = generate_format_ops(specifiers)
     if format_ops is None:
         return None

     literals: list[str] = []
     last_end = 0
     for spec in specifiers:
         # Skip { and }
         literals.append(format_str[last_end : spec.start_pos - 1])
         last_end = spec.start_pos + len(spec.whole_seq) + 1
     literals.append(format_str[last_end:])
     # Deal with escaped {{
     literals = [x.replace("{{", "{").replace("}}", "}") for x in literals]

     return literals, format_ops


 def convert_format_expr_to_str(
     builder: IRBuilder, format_ops: list[FormatOp], exprs: list[Expression], line: int
 ) -> list[Value] | None:
     """Convert expressions into string literal objects with the guidance
     of FormatOps. Return None when fails."""
     if len(format_ops) != len(exprs):
         return None

     converted = []
     for x, format_op in zip(exprs, format_ops):
         node_type = builder.node_type(x)
         if format_op == FormatOp.STR:
             if is_str_rprimitive(node_type):
                 var_str = builder.accept(x)
             elif is_int_rprimitive(node_type) or is_short_int_rprimitive(node_type):
                 var_str = builder.call_c(int_to_str_op, [builder.accept(x)], line)
             else:
                 var_str = builder.call_c(str_op, [builder.accept(x)], line)
         elif format_op == FormatOp.INT:
             if is_int_rprimitive(node_type) or is_short_int_rprimitive(node_type):
                 var_str = builder.call_c(int_to_str_op, [builder.accept(x)], line)
             else:
                 return None
         else:
             return None
         converted.append(var_str)
     return converted


 def join_formatted_strings(
     builder: IRBuilder, literals: list[str] | None, substitutions: list[Value], line: int
 ) -> Value:
     """Merge the list of literals and the list of substitutions
     alternatively using 'str_build_op'.

     `substitutions` is the result value of formatting conversions.

     If the `literals` is set to None, we simply join the substitutions;
     Otherwise, the `literals` is the literal substrings of the original
     format string and its length should be exactly one more than
     substitutions.

     For example:
     (1)    'This is a %s and the value is %d'
         -> literals: ['This is a ', ' and the value is', '']
     (2)    '{} and the value is {}'
         -> literals: ['', ' and the value is', '']
     """
     # The first parameter for str_build_op is the total size of
     # the following PyObject*
     result_list: list[Value] = [Integer(0, c_pyssize_t_rprimitive)]

     if literals is not None:
         for a, b in zip(literals, substitutions):
             if a:
                 result_list.append(builder.load_str(a))
             result_list.append(b)
         if literals[-1]:
             result_list.append(builder.load_str(literals[-1]))
     else:
         result_list.extend(substitutions)

     # Special case for empty string and literal string
     if len(result_list) == 1:
         return builder.load_str("")
     if not substitutions and len(result_list) == 2:
         return result_list[1]

     result_list[0] = Integer(len(result_list) - 1, c_pyssize_t_rprimitive)
     return builder.call_c(str_build_op, result_list, line)


 def convert_format_expr_to_bytes(
     builder: IRBuilder, format_ops: list[FormatOp], exprs: list[Expression], line: int
 ) -> list[Value] | None:
     """Convert expressions into bytes literal objects with the guidance
     of FormatOps. Return None when fails."""
     if len(format_ops) != len(exprs):
         return None

     converted = []
     for x, format_op in zip(exprs, format_ops):
         node_type = builder.node_type(x)
         # conversion type 's' is an alias of 'b' in bytes formatting
         if format_op == FormatOp.BYTES or format_op == FormatOp.STR:
             if is_bytes_rprimitive(node_type):
                 var_bytes = builder.accept(x)
             else:
                 return None
         else:
             return None
         converted.append(var_bytes)
     return converted


 def join_formatted_bytes(
     builder: IRBuilder, literals: list[str], substitutions: list[Value], line: int
 ) -> Value:
     """Merge the list of literals and the list of substitutions
     alternatively using 'bytes_build_op'."""
     result_list: list[Value] = [Integer(0, c_pyssize_t_rprimitive)]

     for a, b in zip(literals, substitutions):
         if a:
             result_list.append(builder.load_bytes_from_str_literal(a))
         result_list.append(b)
     if literals[-1]:
         result_list.append(builder.load_bytes_from_str_literal(literals[-1]))

     # Special case for empty bytes and literal
     if len(result_list) == 1:
         return builder.load_bytes_from_str_literal("")
     if not substitutions and len(result_list) == 2:
         return result_list[1]

     result_list[0] = Integer(len(result_list) - 1, c_pyssize_t_rprimitive)
     return builder.call_c(bytes_build_op, result_list, line)
	"""Tokenizers for three string formatting methods"""

	from __future__ import annotations

	from enum import Enum, unique
	from typing import Final

	from mypy.checkstrformat import (
	ConversionSpecifier,
	parse_conversion_specifiers,
	parse_format_value,
	)
	from mypy.errors import Errors
	from mypy.messages import MessageBuilder
	from mypy.nodes import Context, Expression
	from mypy.options import Options
	from mypyc.ir.ops import Integer, Value
	from mypyc.ir.rtypes import (
	c_pyssize_t_rprimitive,
	is_bytes_rprimitive,
	is_int_rprimitive,
	is_short_int_rprimitive,
	is_str_rprimitive,
	)
	from mypyc.irbuild.builder import IRBuilder
	from mypyc.primitives.bytes_ops import bytes_build_op
	from mypyc.primitives.int_ops import int_to_str_op
	from mypyc.primitives.str_ops import str_build_op, str_op


	@unique
	class FormatOp(Enum):
	"""FormatOp represents conversion operations of string formatting during
	compile time.

	Compare to ConversionSpecifier, FormatOp has fewer attributes.
	For example, to mark a conversion from any object to string,
	ConversionSpecifier may have several representations, like '%s', '{}'
	or '{:{}}'. However, there would only exist one corresponding FormatOp.
	"""

	STR = "s"
	INT = "d"
	BYTES = "b"


	def generate_format_ops(specifiers: list[ConversionSpecifier]) -> list[FormatOp] \| None:
	"""Convert ConversionSpecifier to FormatOp.

	Different ConversionSpecifiers may share a same FormatOp.
	"""
	format_ops = []
	for spec in specifiers:
	# TODO: Match specifiers instead of using whole_seq
	if spec.whole_seq == "%s" or spec.whole_seq == "{:{}}":
	format_op = FormatOp.STR
	elif spec.whole_seq == "%d":
	format_op = FormatOp.INT
	elif spec.whole_seq == "%b":
	format_op = FormatOp.BYTES
	elif spec.whole_seq:
	return None
	else:
	format_op = FormatOp.STR
	format_ops.append(format_op)
	return format_ops


	def tokenizer_printf_style(format_str: str) -> tuple[list[str], list[FormatOp]] \| None:
	"""Tokenize a printf-style format string using regex.

	Return:
	A list of string literals and a list of FormatOps.
	"""
	literals: list[str] = []
	specifiers: list[ConversionSpecifier] = parse_conversion_specifiers(format_str)
	format_ops = generate_format_ops(specifiers)
	if format_ops is None:
	return None

	last_end = 0
	for spec in specifiers:
	cur_start = spec.start_pos
	literals.append(format_str[last_end:cur_start])
	last_end = cur_start + len(spec.whole_seq)
	literals.append(format_str[last_end:])

	return literals, format_ops


	# The empty Context as an argument for parse_format_value().
	# It wouldn't be used since the code has passed the type-checking.
	EMPTY_CONTEXT: Final = Context()


	def tokenizer_format_call(format_str: str) -> tuple[list[str], list[FormatOp]] \| None:
	"""Tokenize a str.format() format string.

	The core function parse_format_value() is shared with mypy.
	With these specifiers, we then parse the literal substrings
	of the original format string and convert `ConversionSpecifier`
	to `FormatOp`.

	Return:
	A list of string literals and a list of FormatOps. The literals
	are interleaved with FormatOps and the length of returned literals
	should be exactly one more than FormatOps.
	Return None if it cannot parse the string.
	"""
	# Creates an empty MessageBuilder here.
	# It wouldn't be used since the code has passed the type-checking.
	specifiers = parse_format_value(
	format_str, EMPTY_CONTEXT, MessageBuilder(Errors(Options()), {})
	)
	if specifiers is None:
	return None
	format_ops = generate_format_ops(specifiers)
	if format_ops is None:
	return None

	literals: list[str] = []
	last_end = 0
	for spec in specifiers:
	# Skip { and }
	literals.append(format_str[last_end : spec.start_pos - 1])
	last_end = spec.start_pos + len(spec.whole_seq) + 1
	literals.append(format_str[last_end:])
	# Deal with escaped {{
	literals = [x.replace("{{", "{").replace("}}", "}") for x in literals]

	return literals, format_ops


	def convert_format_expr_to_str(
	builder: IRBuilder, format_ops: list[FormatOp], exprs: list[Expression], line: int
	) -> list[Value] \| None:
	"""Convert expressions into string literal objects with the guidance
	of FormatOps. Return None when fails."""
	if len(format_ops) != len(exprs):
	return None

	converted = []
	for x, format_op in zip(exprs, format_ops):
	node_type = builder.node_type(x)
	if format_op == FormatOp.STR:
	if is_str_rprimitive(node_type):
	var_str = builder.accept(x)
	elif is_int_rprimitive(node_type) or is_short_int_rprimitive(node_type):
	var_str = builder.call_c(int_to_str_op, [builder.accept(x)], line)
	else:
	var_str = builder.call_c(str_op, [builder.accept(x)], line)
	elif format_op == FormatOp.INT:
	if is_int_rprimitive(node_type) or is_short_int_rprimitive(node_type):
	var_str = builder.call_c(int_to_str_op, [builder.accept(x)], line)
	else:
	return None
	else:
	return None
	converted.append(var_str)
	return converted


	def join_formatted_strings(
	builder: IRBuilder, literals: list[str] \| None, substitutions: list[Value], line: int
	) -> Value:
	"""Merge the list of literals and the list of substitutions
	alternatively using 'str_build_op'.

	`substitutions` is the result value of formatting conversions.

	If the `literals` is set to None, we simply join the substitutions;
	Otherwise, the `literals` is the literal substrings of the original
	format string and its length should be exactly one more than
	substitutions.

	For example:
	(1) 'This is a %s and the value is %d'
	-> literals: ['This is a ', ' and the value is', '']
	(2) '{} and the value is {}'
	-> literals: ['', ' and the value is', '']
	"""
	# The first parameter for str_build_op is the total size of
	# the following PyObject*
	result_list: list[Value] = [Integer(0, c_pyssize_t_rprimitive)]

	if literals is not None:
	for a, b in zip(literals, substitutions):
	if a:
	result_list.append(builder.load_str(a))
	result_list.append(b)
	if literals[-1]:
	result_list.append(builder.load_str(literals[-1]))
	else:
	result_list.extend(substitutions)

	# Special case for empty string and literal string
	if len(result_list) == 1:
	return builder.load_str("")
	if not substitutions and len(result_list) == 2:
	return result_list[1]

	result_list[0] = Integer(len(result_list) - 1, c_pyssize_t_rprimitive)
	return builder.call_c(str_build_op, result_list, line)


	def convert_format_expr_to_bytes(
	builder: IRBuilder, format_ops: list[FormatOp], exprs: list[Expression], line: int
	) -> list[Value] \| None:
	"""Convert expressions into bytes literal objects with the guidance
	of FormatOps. Return None when fails."""
	if len(format_ops) != len(exprs):
	return None

	converted = []
	for x, format_op in zip(exprs, format_ops):
	node_type = builder.node_type(x)
	# conversion type 's' is an alias of 'b' in bytes formatting
	if format_op == FormatOp.BYTES or format_op == FormatOp.STR:
	if is_bytes_rprimitive(node_type):
	var_bytes = builder.accept(x)
	else:
	return None
	else:
	return None
	converted.append(var_bytes)
	return converted


	def join_formatted_bytes(
	builder: IRBuilder, literals: list[str], substitutions: list[Value], line: int
	) -> Value:
	"""Merge the list of literals and the list of substitutions
	alternatively using 'bytes_build_op'."""
	result_list: list[Value] = [Integer(0, c_pyssize_t_rprimitive)]

	for a, b in zip(literals, substitutions):
	if a:
	result_list.append(builder.load_bytes_from_str_literal(a))
	result_list.append(b)
	if literals[-1]:
	result_list.append(builder.load_bytes_from_str_literal(literals[-1]))

	# Special case for empty bytes and literal
	if len(result_list) == 1:
	return builder.load_bytes_from_str_literal("")
	if not substitutions and len(result_list) == 2:
	return result_list[1]

	result_list[0] = Integer(len(result_list) - 1, c_pyssize_t_rprimitive)
	return builder.call_c(bytes_build_op, result_list, line)