| """Parsing/inferring signatures from documentation. |
| |
| This module provides several functions to generate better stubs using |
| docstrings and Sphinx docs (.rst files). |
| """ |
| import re |
| import io |
| import contextlib |
| import tokenize |
| |
| from typing import ( |
| Optional, MutableMapping, MutableSequence, List, Sequence, Tuple, NamedTuple, Any |
| ) |
| from typing_extensions import Final |
| |
| # Type alias for signatures strings in format ('func_name', '(arg, opt_arg=False)'). |
| Sig = Tuple[str, str] |
| |
| |
| _TYPE_RE: Final = re.compile(r"^[a-zA-Z_][\w\[\], ]*(\.[a-zA-Z_][\w\[\], ]*)*$") |
| _ARG_NAME_RE: Final = re.compile(r"\**[A-Za-z_][A-Za-z0-9_]*$") |
| |
| |
| def is_valid_type(s: str) -> bool: |
| """Try to determine whether a string might be a valid type annotation.""" |
| if s in ('True', 'False', 'retval'): |
| return False |
| if ',' in s and '[' not in s: |
| return False |
| return _TYPE_RE.match(s) is not None |
| |
| |
| class ArgSig: |
| """Signature info for a single argument.""" |
| |
| def __init__(self, name: str, type: Optional[str] = None, default: bool = False): |
| self.name = name |
| if type and not is_valid_type(type): |
| raise ValueError("Invalid type: " + type) |
| self.type = type |
| # Does this argument have a default value? |
| self.default = default |
| |
| def __repr__(self) -> str: |
| return "ArgSig(name={}, type={}, default={})".format(repr(self.name), repr(self.type), |
| repr(self.default)) |
| |
| def __eq__(self, other: Any) -> bool: |
| if isinstance(other, ArgSig): |
| return (self.name == other.name and self.type == other.type and |
| self.default == other.default) |
| return False |
| |
| |
| FunctionSig = NamedTuple('FunctionSig', [ |
| ('name', str), |
| ('args', List[ArgSig]), |
| ('ret_type', str) |
| ]) |
| |
| |
| # States of the docstring parser. |
| STATE_INIT: Final = 1 |
| STATE_FUNCTION_NAME: Final = 2 |
| STATE_ARGUMENT_LIST: Final = 3 |
| STATE_ARGUMENT_TYPE: Final = 4 |
| STATE_ARGUMENT_DEFAULT: Final = 5 |
| STATE_RETURN_VALUE: Final = 6 |
| STATE_OPEN_BRACKET: Final = 7 # For generic types. |
| |
| |
| class DocStringParser: |
| """Parse function signatures in documentation.""" |
| |
| def __init__(self, function_name: str) -> None: |
| # Only search for signatures of function with this name. |
| self.function_name = function_name |
| self.state = [STATE_INIT] |
| self.accumulator = "" |
| self.arg_type: Optional[str] = None |
| self.arg_name = "" |
| self.arg_default: Optional[str] = None |
| self.ret_type = "Any" |
| self.found = False |
| self.args: List[ArgSig] = [] |
| # Valid signatures found so far. |
| self.signatures: List[FunctionSig] = [] |
| |
| def add_token(self, token: tokenize.TokenInfo) -> None: |
| """Process next token from the token stream.""" |
| if (token.type == tokenize.NAME and token.string == self.function_name and |
| self.state[-1] == STATE_INIT): |
| self.state.append(STATE_FUNCTION_NAME) |
| |
| elif (token.type == tokenize.OP and token.string == '(' and |
| self.state[-1] == STATE_FUNCTION_NAME): |
| self.state.pop() |
| self.accumulator = "" |
| self.found = True |
| self.state.append(STATE_ARGUMENT_LIST) |
| |
| elif self.state[-1] == STATE_FUNCTION_NAME: |
| # Reset state, function name not followed by '('. |
| self.state.pop() |
| |
| elif (token.type == tokenize.OP and token.string in ('[', '(', '{') and |
| self.state[-1] != STATE_INIT): |
| self.accumulator += token.string |
| self.state.append(STATE_OPEN_BRACKET) |
| |
| elif (token.type == tokenize.OP and token.string in (']', ')', '}') and |
| self.state[-1] == STATE_OPEN_BRACKET): |
| self.accumulator += token.string |
| self.state.pop() |
| |
| elif (token.type == tokenize.OP and token.string == ':' and |
| self.state[-1] == STATE_ARGUMENT_LIST): |
| self.arg_name = self.accumulator |
| self.accumulator = "" |
| self.state.append(STATE_ARGUMENT_TYPE) |
| |
| elif (token.type == tokenize.OP and token.string == '=' and |
| self.state[-1] in (STATE_ARGUMENT_LIST, STATE_ARGUMENT_TYPE)): |
| if self.state[-1] == STATE_ARGUMENT_TYPE: |
| self.arg_type = self.accumulator |
| self.state.pop() |
| else: |
| self.arg_name = self.accumulator |
| self.accumulator = "" |
| self.state.append(STATE_ARGUMENT_DEFAULT) |
| |
| elif (token.type == tokenize.OP and token.string in (',', ')') and |
| self.state[-1] in (STATE_ARGUMENT_LIST, STATE_ARGUMENT_DEFAULT, |
| STATE_ARGUMENT_TYPE)): |
| if self.state[-1] == STATE_ARGUMENT_DEFAULT: |
| self.arg_default = self.accumulator |
| self.state.pop() |
| elif self.state[-1] == STATE_ARGUMENT_TYPE: |
| self.arg_type = self.accumulator |
| self.state.pop() |
| elif self.state[-1] == STATE_ARGUMENT_LIST: |
| self.arg_name = self.accumulator |
| if not (token.string == ')' and self.accumulator.strip() == '') \ |
| and not _ARG_NAME_RE.match(self.arg_name): |
| # Invalid argument name. |
| self.reset() |
| return |
| |
| if token.string == ')': |
| self.state.pop() |
| |
| # arg_name is empty when there are no args. e.g. func() |
| if self.arg_name: |
| try: |
| self.args.append(ArgSig(name=self.arg_name, type=self.arg_type, |
| default=bool(self.arg_default))) |
| except ValueError: |
| # wrong type, use Any |
| self.args.append(ArgSig(name=self.arg_name, type=None, |
| default=bool(self.arg_default))) |
| self.arg_name = "" |
| self.arg_type = None |
| self.arg_default = None |
| self.accumulator = "" |
| |
| elif token.type == tokenize.OP and token.string == '->' and self.state[-1] == STATE_INIT: |
| self.accumulator = "" |
| self.state.append(STATE_RETURN_VALUE) |
| |
| # ENDMAKER is necessary for python 3.4 and 3.5. |
| elif (token.type in (tokenize.NEWLINE, tokenize.ENDMARKER) and |
| self.state[-1] in (STATE_INIT, STATE_RETURN_VALUE)): |
| if self.state[-1] == STATE_RETURN_VALUE: |
| if not is_valid_type(self.accumulator): |
| self.reset() |
| return |
| self.ret_type = self.accumulator |
| self.accumulator = "" |
| self.state.pop() |
| |
| if self.found: |
| self.signatures.append(FunctionSig(name=self.function_name, args=self.args, |
| ret_type=self.ret_type)) |
| self.found = False |
| self.args = [] |
| self.ret_type = 'Any' |
| # Leave state as INIT. |
| else: |
| self.accumulator += token.string |
| |
| def reset(self) -> None: |
| self.state = [STATE_INIT] |
| self.args = [] |
| self.found = False |
| self.accumulator = "" |
| |
| def get_signatures(self) -> List[FunctionSig]: |
| """Return sorted copy of the list of signatures found so far.""" |
| def has_arg(name: str, signature: FunctionSig) -> bool: |
| return any(x.name == name for x in signature.args) |
| |
| def args_kwargs(signature: FunctionSig) -> bool: |
| return has_arg('*args', signature) and has_arg('**kwargs', signature) |
| |
| # Move functions with (*args, **kwargs) in their signature to last place. |
| return list(sorted(self.signatures, key=lambda x: 1 if args_kwargs(x) else 0)) |
| |
| |
| def infer_sig_from_docstring(docstr: Optional[str], name: str) -> Optional[List[FunctionSig]]: |
| """Convert function signature to list of TypedFunctionSig |
| |
| Look for function signatures of function in docstring. Signature is a string of |
| the format <function_name>(<signature>) -> <return type> or perhaps without |
| the return type. |
| |
| Returns empty list, when no signature is found, one signature in typical case, |
| multiple signatures, if docstring specifies multiple signatures for overload functions. |
| Return None if the docstring is empty. |
| |
| Arguments: |
| * docstr: docstring |
| * name: name of function for which signatures are to be found |
| """ |
| if not docstr: |
| return None |
| |
| state = DocStringParser(name) |
| # Return all found signatures, even if there is a parse error after some are found. |
| with contextlib.suppress(tokenize.TokenError): |
| try: |
| tokens = tokenize.tokenize(io.BytesIO(docstr.encode('utf-8')).readline) |
| for token in tokens: |
| state.add_token(token) |
| except IndentationError: |
| return None |
| sigs = state.get_signatures() |
| |
| def is_unique_args(sig: FunctionSig) -> bool: |
| """return true if function argument names are unique""" |
| return len(sig.args) == len(set((arg.name for arg in sig.args))) |
| |
| # Return only signatures that have unique argument names. Mypy fails on non-unique arg names. |
| return [sig for sig in sigs if is_unique_args(sig)] |
| |
| |
| def infer_arg_sig_from_anon_docstring(docstr: str) -> List[ArgSig]: |
| """Convert signature in form of "(self: TestClass, arg0: str='ada')" to List[TypedArgList].""" |
| ret = infer_sig_from_docstring("stub" + docstr, "stub") |
| if ret: |
| return ret[0].args |
| return [] |
| |
| |
| def infer_ret_type_sig_from_docstring(docstr: str, name: str) -> Optional[str]: |
| """Convert signature in form of "func(self: TestClass, arg0) -> int" to their return type.""" |
| ret = infer_sig_from_docstring(docstr, name) |
| if ret: |
| return ret[0].ret_type |
| return None |
| |
| |
| def infer_ret_type_sig_from_anon_docstring(docstr: str) -> Optional[str]: |
| """Convert signature in form of "(self: TestClass, arg0) -> int" to their return type.""" |
| return infer_ret_type_sig_from_docstring("stub" + docstr.strip(), "stub") |
| |
| |
| def parse_signature(sig: str) -> Optional[Tuple[str, |
| List[str], |
| List[str]]]: |
| """Split function signature into its name, positional an optional arguments. |
| |
| The expected format is "func_name(arg, opt_arg=False)". Return the name of function |
| and lists of positional and optional argument names. |
| """ |
| m = re.match(r'([.a-zA-Z0-9_]+)\(([^)]*)\)', sig) |
| if not m: |
| return None |
| name = m.group(1) |
| name = name.split('.')[-1] |
| arg_string = m.group(2) |
| if not arg_string.strip(): |
| # Simple case -- no arguments. |
| return name, [], [] |
| |
| args = [arg.strip() for arg in arg_string.split(',')] |
| positional = [] |
| optional = [] |
| i = 0 |
| while i < len(args): |
| # Accept optional arguments as in both formats: x=None and [x]. |
| if args[i].startswith('[') or '=' in args[i]: |
| break |
| positional.append(args[i].rstrip('[')) |
| i += 1 |
| if args[i - 1].endswith('['): |
| break |
| while i < len(args): |
| arg = args[i] |
| arg = arg.strip('[]') |
| arg = arg.split('=')[0] |
| optional.append(arg) |
| i += 1 |
| return name, positional, optional |
| |
| |
| def build_signature(positional: Sequence[str], |
| optional: Sequence[str]) -> str: |
| """Build function signature from lists of positional and optional argument names.""" |
| args: MutableSequence[str] = [] |
| args.extend(positional) |
| for arg in optional: |
| if arg.startswith('*'): |
| args.append(arg) |
| else: |
| args.append('%s=...' % arg) |
| sig = '(%s)' % ', '.join(args) |
| # Ad-hoc fixes. |
| sig = sig.replace('(self)', '') |
| return sig |
| |
| |
| def parse_all_signatures(lines: Sequence[str]) -> Tuple[List[Sig], |
| List[Sig]]: |
| """Parse all signatures in a given reST document. |
| |
| Return lists of found signatures for functions and classes. |
| """ |
| sigs = [] |
| class_sigs = [] |
| for line in lines: |
| line = line.strip() |
| m = re.match(r'\.\. *(function|method|class) *:: *[a-zA-Z_]', line) |
| if m: |
| sig = line.split('::')[1].strip() |
| parsed = parse_signature(sig) |
| if parsed: |
| name, fixed, optional = parsed |
| if m.group(1) != 'class': |
| sigs.append((name, build_signature(fixed, optional))) |
| else: |
| class_sigs.append((name, build_signature(fixed, optional))) |
| |
| return sorted(sigs), sorted(class_sigs) |
| |
| |
| def find_unique_signatures(sigs: Sequence[Sig]) -> List[Sig]: |
| """Remove names with duplicate found signatures.""" |
| sig_map: MutableMapping[str, List[str]] = {} |
| for name, sig in sigs: |
| sig_map.setdefault(name, []).append(sig) |
| |
| result = [] |
| for name, name_sigs in sig_map.items(): |
| if len(set(name_sigs)) == 1: |
| result.append((name, name_sigs[0])) |
| return sorted(result) |
| |
| |
| def infer_prop_type_from_docstring(docstr: Optional[str]) -> Optional[str]: |
| """Check for Google/Numpy style docstring type annotation for a property. |
| |
| The docstring has the format "<type>: <descriptions>". |
| In the type string, we allow the following characters: |
| * dot: because sometimes classes are annotated using full path |
| * brackets: to allow type hints like List[int] |
| * comma/space: things like Tuple[int, int] |
| """ |
| if not docstr: |
| return None |
| test_str = r'^([a-zA-Z0-9_, \.\[\]]*): ' |
| m = re.match(test_str, docstr) |
| return m.group(1) if m else None |