"""Functions for parsing Python source code."""
import glob
import io
import os
import re
import token
import tokenize
import parso
from .misc import MakeTextIO, first_non_none
from .typing import TYPE_CHECKING, cast
if TYPE_CHECKING:
from .typing import Dict, Linesep, List, Literal, Optional, TextIO, Tuple, Union
# Collect every (major, minor) Python version parso ships a grammar file for.
# Grammar files are named like ``grammar38.txt`` / ``grammar310.txt``: the
# basename is 'grammar' (7 chars) + one major digit + minor digits + '.txt'.
PARSO_GRAMMAR_VERSIONS = sorted(
    (int(os.path.basename(grammar_file)[7]), int(os.path.basename(grammar_file)[8:-4]))
    for grammar_file in glob.iglob(os.path.join(parso.__path__[0], 'python', 'grammar*.txt'))  # type: ignore[attr-defined]
)  # type: List[Tuple[int, int]]
def get_parso_grammar_versions(minimum: 'Optional[str]' = None) -> 'List[str]':
    """Get Python versions that parso supports to parse grammar.

    Args:
        minimum: filter result by this minimum version

    Returns:
        a list of Python versions that parso supports to parse grammar

    Raises:
        TypeError: if ``minimum`` is not :obj:`str`
        ValueError: if ``minimum`` is invalid

    """
    if minimum is not None:
        if not isinstance(minimum, str):
            raise TypeError('minimum version should be a string')
        # accept only canonical "<major>.<minor>" with no leading zeros
        if re.fullmatch(r'(?:0|[1-9][0-9]*)\.(?:0|[1-9][0-9]*)', minimum) is None:
            raise ValueError('invalid minimum version')
        threshold = tuple(int(component) for component in minimum.split('.'))  # type: Tuple[int, ...]
    else:
        threshold = ()  # empty tuple compares less than any version tuple
    return ['{}.{}'.format(*version) for version in PARSO_GRAMMAR_VERSIONS if version >= threshold]
class BPCSyntaxError(SyntaxError):
    """Syntax error detected when parsing code.

    Subclasses :exc:`SyntaxError` so existing handlers that catch
    :exc:`SyntaxError` keep working.
    """
def detect_encoding(code: bytes) -> str:
    """Detect encoding of Python source code as specified in :pep:`263`.

    Args:
        code: the code to detect encoding

    Returns:
        the detected encoding, or the default encoding (``utf-8``)

    Raises:
        TypeError: if ``code`` is not a :obj:`bytes` string
        SyntaxError: if both a BOM and a cookie are present, but disagree

    """
    if not isinstance(code, bytes):
        raise TypeError("'code' should be bytes")
    # delegate to the stdlib tokenizer, which implements the PEP 263 rules
    stream = io.BytesIO(code)
    try:
        encoding, _ = tokenize.detect_encoding(stream.readline)
    finally:
        stream.close()
    return encoding
def detect_linesep(code: 'Union[str, bytes, TextIO, parso.tree.NodeOrLeaf]') -> 'Linesep':
    r"""Detect linesep of Python source code.

    Args:
        code: the code to detect linesep

    Returns:
        :data:`~bpc_utils.Linesep`: the detected linesep (one of ``'\n'``, ``'\r\n'`` and ``'\r'``)

    Notes:
        In case of mixed linesep, try voting by the number of occurrences of each linesep value.

        When there is a tie, prefer ``LF`` to ``CRLF``, prefer ``CRLF`` to ``CR``.

    """
    # normalise the input down to str (or a text stream)
    if isinstance(code, parso.tree.NodeOrLeaf):
        code = code.get_code()
    if isinstance(code, bytes):
        code = code.decode(detect_encoding(code))

    tally = {
        'CR': 0,
        'CRLF': 0,
        'LF': 0,
    }  # type: Dict[Literal['CR', 'CRLF', 'LF'], int]
    with MakeTextIO(cast('Union[str, TextIO]', code)) as stream:
        for line in stream:
            # a '\r\n' line ends with '\n', never with bare '\r',
            # so checking 'CR' first is safe
            if line.endswith('\r'):
                tally['CR'] += 1
            elif line.endswith('\r\n'):
                tally['CRLF'] += 1
            elif line.endswith('\n'):
                tally['LF'] += 1

    # vote by count; the middle element breaks ties (LF > CRLF > CR)
    winner = max(
        (tally['LF'], 3, '\n'),
        (tally['CRLF'], 2, '\r\n'),
        (tally['CR'], 1, '\r'),
    )
    return cast('Linesep', winner[2])
def detect_indentation(code: 'Union[str, bytes, TextIO, parso.tree.NodeOrLeaf]') -> str:
    """Detect indentation of Python source code.

    Args:
        code: the code to detect indentation

    Returns:
        the detected indentation sequence

    Raises:
        :exc:`~tokenize.TokenError`: when failed to tokenize the source code under certain cases,
            see documentation of :exc:`~tokenize.TokenError` for more details

    Notes:
        In case of mixed indentation, try voting by the number of occurrences of
        each indentation value (*spaces* and *tabs*).

        When there is a tie between *spaces* and *tabs*, prefer **4 spaces** for :pep:`8`.

    """
    # normalise the input down to str (or a text stream)
    if isinstance(code, parso.tree.NodeOrLeaf):
        code = code.get_code()
    if isinstance(code, bytes):
        code = code.decode(detect_encoding(code))

    counts = {
        'space': 0,
        'tab': 0
    }  # type: Dict[Literal['space', 'tab'], int]
    narrowest = None  # type: Optional[int]  # width of the narrowest all-space indent seen
    with MakeTextIO(cast('Union[str, TextIO]', code)) as stream:
        for tok in tokenize.generate_tokens(stream.readline):
            if tok.type != token.INDENT:
                continue
            if '\t' in tok.string and ' ' in tok.string:
                continue  # skip indentation with mixed spaces and tabs
            if '\t' in tok.string:
                counts['tab'] += 1
            else:
                counts['space'] += 1
                # track the narrowest pure-space indent only
                if narrowest is None or len(tok.string) < narrowest:
                    narrowest = len(tok.string)

    if counts['space'] > counts['tab']:
        return ' ' * cast(int, narrowest)
    if counts['tab'] > counts['space']:
        return '\t'
    return ' ' * 4  # same number of spaces and tabs, prefer 4 spaces for PEP 8
def parso_parse(code: 'Union[str, bytes]', filename: 'Optional[str]' = None, *,
                version: 'Optional[str]' = None) -> 'parso.python.tree.Module':
    """Parse Python source code with parso.

    Args:
        code: the code to be parsed
        filename: an optional source file name to provide a context in case of error
        version: parse the code as this version (uses the latest version by default)

    Returns:
        parso AST

    Raises:
        BPCSyntaxError: when source code contains syntax errors

    """
    filename = first_non_none(filename, '<unknown>')
    # fall back to the newest grammar parso knows about
    target_version = version if version is not None else get_parso_grammar_versions()[-1]
    grammar = parso.load_grammar(version=target_version)
    if isinstance(code, bytes):
        try:
            code = code.decode(detect_encoding(code))
        except SyntaxError as e:
            # conflicting BOM / coding cookie; re-raise with file context
            raise BPCSyntaxError('failed to detect encoding for source file %r: %s' % (filename, e)) from None
    module = grammar.parse(code, error_recovery=True)  # type: parso.python.tree.Module
    errors = grammar.iter_errors(module)
    if errors:
        # error.start_pos is a (line, column) tuple; append the message for formatting
        messages = '\n'.join('[L%dC%d] %s' % (error.start_pos + (error.message,)) for error in errors)
        raise BPCSyntaxError('source file %r contains the following syntax errors:\n%s' % (filename, messages))
    return module
# Public API of this module (names exported via ``from ... import *``).
__all__ = ['get_parso_grammar_versions', 'BPCSyntaxError', 'detect_encoding', 'detect_linesep', 'detect_indentation',
           'parso_parse']