Source code for bpc_utils.parsing

"""Functions for parsing Python source code."""

import glob
import io
import os
import re
import token
import tokenize

import parso

from .misc import MakeTextIO, first_non_none
from .typing import TYPE_CHECKING, cast

if TYPE_CHECKING:
    from .typing import Dict, Linesep, List, Literal, Optional, TextIO, Tuple, Union

# ``(major, minor)`` pairs, in ascending order, derived from the ``grammar*.txt``
# files found in the ``python`` directory of the installed parso package.
PARSO_GRAMMAR_VERSIONS = []  # type: List[Tuple[int, int]]
for grammar_file in glob.iglob(os.path.join(parso.__path__[0], 'python', 'grammar*.txt')):  # type: ignore[attr-defined]
    # Drop the ``grammar`` prefix and the ``.txt`` suffix to keep only the digits.
    grammar_version = os.path.basename(grammar_file)[len('grammar'):-len('.txt')]
    # The first digit is taken as the major version, the rest as the minor version.
    PARSO_GRAMMAR_VERSIONS.append((int(grammar_version[0]), int(grammar_version[1:])))
PARSO_GRAMMAR_VERSIONS.sort()


def get_parso_grammar_versions(minimum: 'Optional[str]' = None) -> 'List[str]':
    """List the Python versions whose grammar parso is able to parse.

    Args:
        minimum: when given, only versions greater than or equal to this
            ``'<major>.<minor>'`` string are returned

    Returns:
        the supported versions as ``'<major>.<minor>'`` strings, in the order
        of ``PARSO_GRAMMAR_VERSIONS``

    Raises:
        TypeError: if ``minimum`` is not :obj:`str`
        ValueError: if ``minimum`` is not of the form ``<major>.<minor>``
            (decimal numbers without leading zeros)

    """
    # An empty tuple compares lower than any version tuple, i.e. "no filter".
    lower_bound = ()  # type: Tuple[int, ...]
    if minimum is not None:
        if not isinstance(minimum, str):
            raise TypeError('minimum version should be a string')
        if re.fullmatch(r'(?:0|[1-9][0-9]*)\.(?:0|[1-9][0-9]*)', minimum) is None:
            raise ValueError('invalid minimum version')
        lower_bound = tuple(int(part) for part in minimum.split('.'))
    return [
        '{}.{}'.format(*version)
        for version in PARSO_GRAMMAR_VERSIONS
        if version >= lower_bound
    ]
class BPCSyntaxError(SyntaxError):
    """Raised when parsing source code reveals a syntax error.

    A plain :exc:`SyntaxError` subclass that adds no attributes or behaviour
    of its own.

    """
def detect_encoding(code: bytes) -> str:
    """Determine the encoding of Python source code according to :pep:`263`.

    Args:
        code: raw bytes of the source code

    Returns:
        the encoding name reported by :func:`tokenize.detect_encoding`
        (``utf-8`` when the source declares nothing)

    Raises:
        TypeError: if ``code`` is not a :obj:`bytes` string
        SyntaxError: if both a BOM and a cookie are present, but disagree

    """
    if not isinstance(code, bytes):
        raise TypeError("'code' should be bytes")
    with io.BytesIO(code) as stream:
        # The second item (the lines consumed during detection) is not needed.
        encoding, _ = tokenize.detect_encoding(stream.readline)
    return encoding
def detect_linesep(code: 'Union[str, bytes, TextIO, parso.tree.NodeOrLeaf]') -> 'Linesep':
    r"""Determine which line separator Python source code uses.

    Args:
        code: the source code to inspect; a parso node is converted with its
            ``get_code()`` method and :obj:`bytes` are decoded with the
            encoding found by :func:`detect_encoding`

    Returns:
        :data:`~bpc_utils.Linesep`: the detected linesep (one of ``'\n'``, ``'\r\n'`` and ``'\r'``)

    Notes:
        Every line ending counts as one vote and the most frequent separator
        wins. On a tie ``LF`` is preferred to ``CRLF`` and ``CRLF`` to ``CR``;
        input without any line ending therefore yields ``'\n'``.

    """
    if isinstance(code, parso.tree.NodeOrLeaf):
        code = code.get_code()
    if isinstance(code, bytes):
        code = code.decode(detect_encoding(code))

    lf_votes = crlf_votes = cr_votes = 0
    with MakeTextIO(cast('Union[str, TextIO]', code)) as file:
        for line in file:
            # The three suffixes are mutually exclusive; a line without any
            # terminator (e.g. an unterminated last line) casts no vote.
            if line.endswith('\r\n'):
                crlf_votes += 1
            elif line.endswith('\r'):
                cr_votes += 1
            elif line.endswith('\n'):
                lf_votes += 1

    # Tuples compare element-wise: vote count first, then the fixed priority
    # (LF > CRLF > CR) which settles ties.
    candidates = [
        (lf_votes, 3, '\n'),
        (crlf_votes, 2, '\r\n'),
        (cr_votes, 1, '\r'),
    ]
    return cast('Linesep', max(candidates)[2])
def detect_indentation(code: 'Union[str, bytes, TextIO, parso.tree.NodeOrLeaf]') -> str:
    """Determine the indentation sequence used by Python source code.

    Args:
        code: the source code to inspect; a parso node is converted with its
            ``get_code()`` method and :obj:`bytes` are decoded with the
            encoding found by :func:`detect_encoding`

    Returns:
        a tab, or a run of spaces as long as the shortest space-only
        ``INDENT`` token seen; four spaces when spaces and tabs are tied

    Raises:
        :exc:`~tokenize.TokenError`: when failed to tokenize the source code under certain cases,
            see documentation of :exc:`~tokenize.TokenError` for more details

    Notes:
        Every ``INDENT`` token votes for either *spaces* or *tabs*; tokens
        mixing both are ignored. When there is a tie between *spaces* and
        *tabs* (including no indentation at all), **4 spaces** are preferred
        for :pep:`8`.

    """
    if isinstance(code, parso.tree.NodeOrLeaf):
        code = code.get_code()
    if isinstance(code, bytes):
        code = code.decode(detect_encoding(code))

    space_votes = 0
    tab_votes = 0
    narrowest = None  # type: Optional[int]
    with MakeTextIO(cast('Union[str, TextIO]', code)) as file:
        for tok in tokenize.generate_tokens(file.readline):
            if tok.type != token.INDENT:
                continue
            indent = tok.string
            has_tab = '\t' in indent
            if has_tab and ' ' in indent:
                continue  # skip indentation with mixed spaces and tabs
            if has_tab:
                tab_votes += 1
                continue
            space_votes += 1
            width = len(indent)
            narrowest = width if narrowest is None else min(narrowest, width)

    if space_votes > tab_votes:
        # At least one space vote was cast, so ``narrowest`` has been set.
        return ' ' * cast(int, narrowest)
    if tab_votes > space_votes:
        return '\t'
    return ' ' * 4  # same number of spaces and tabs, prefer 4 spaces for PEP 8
def parso_parse(code: 'Union[str, bytes]', filename: 'Optional[str]' = None, *,
                version: 'Optional[str]' = None) -> 'parso.python.tree.Module':
    """Parse Python source code with parso.

    Args:
        code: the code to be parsed; :obj:`bytes` are decoded with the
            encoding found by :func:`detect_encoding`
        filename: an optional source file name to provide a context in case of error
            (``'<unknown>'`` is used in messages when omitted)
        version: parse the code as this version (uses the latest version by default)

    Returns:
        parso AST

    Raises:
        BPCSyntaxError: when the encoding of ``code`` cannot be detected, or
            when source code contains syntax errors
        UnicodeDecodeError: when ``code`` is :obj:`bytes` that cannot be
            decoded with the detected encoding (this error is not converted)

    """
    filename = first_non_none(filename, '<unknown>')
    grammar = parso.load_grammar(version=version if version is not None else get_parso_grammar_versions()[-1])

    if isinstance(code, bytes):
        try:
            code = code.decode(detect_encoding(code))
        except SyntaxError as e:
            raise BPCSyntaxError('failed to detect encoding for source file %r: %s' % (filename, e)) from None

    # Parse with error recovery so that all syntax errors can be collected
    # and reported together below.
    module = grammar.parse(code, error_recovery=True)  # type: parso.python.tree.Module

    # ``iter_errors`` is documented as returning a generator, and a generator
    # is always truthy; materialise it so the emptiness check below is
    # reliable whatever kind of iterable is returned.
    errors = list(grammar.iter_errors(module))
    if errors:
        error_messages = '\n'.join('[L%dC%d] %s' % (error.start_pos + (error.message,)) for error in errors)
        raise BPCSyntaxError('source file %r contains the following syntax errors:\n%s' % (filename, error_messages))

    return module
# Names exported by ``from bpc_utils.parsing import *``.
__all__ = [
    'get_parso_grammar_versions',
    'BPCSyntaxError',
    'detect_encoding',
    'detect_linesep',
    'detect_indentation',
    'parso_parse',
]