Source code for sphinx_gallery.py_source_parser

r"""Parser for python source files."""

# Created Sun Nov 27 14:03:07 2016
# Author: Óscar Nájera

import ast
import re
import tokenize
from collections import namedtuple
from io import BytesIO
from pathlib import Path
from textwrap import dedent
from typing import Any, Literal, overload

from sphinx.errors import ExtensionError
from sphinx.util.logging import getLogger

# Logger used for warnings emitted while parsing example files.
logger = getLogger("sphinx-gallery")

# Docstring returned in place of the real one when an example file cannot be
# parsed because of a SyntaxError.
SYNTAX_ERROR_DOCSTRING = """
SyntaxError
===========

Example script with invalid Python syntax
"""

# The pattern for in-file config comments is designed to not greedily match
# newlines at the start and end, except for one newline at the end. This
# ensures that the matched pattern can be removed from the code without
# changing the block structure; i.e. empty newlines are preserved, e.g. in
#
#     a = 1
#
#     # sphinx_gallery_thumbnail_number = 2
#
#     b = 2
#
# FLAG_START: optional spaces/tabs at the start of a line, a ``#`` and any
# whitespace that follows it.
FLAG_START = r"^[\ \t]*#\s*"
# FLAG_BODY: group 1 is the option name (the part after ``sphinx_gallery_``),
# group 2 is the optional ``= value`` part and group 3 is the raw value text.
# At most one trailing newline is consumed.
FLAG_BODY = r"sphinx_gallery_([A-Za-z0-9_]+)(\s*=\s*(.+))?[\ \t]*\n?"
INFILE_CONFIG_PATTERN = re.compile(FLAG_START + FLAG_BODY, re.MULTILINE)

# Uncompiled regex sources for the comment lines that open and close an
# ignore block; they are only meaningful with ``re.MULTILINE``.
START_IGNORE_FLAG = FLAG_START + "sphinx_gallery_start_ignore"
END_IGNORE_FLAG = FLAG_START + "sphinx_gallery_end_ignore"
# Matches non-greedily from a start flag up to the nearest end flag, plus at
# most one newline after the end flag.
IGNORE_BLOCK_PATTERN = re.compile(
    rf"{START_IGNORE_FLAG}(?:[\s\S]*?){END_IGNORE_FLAG}\n?", re.MULTILINE
)



[docs]
def parse_source_file(filename: str | Path) -> tuple[ast.Module | None, str]:
    """Parse source file into AST node.

    Parameters
    ----------
    filename : str or Path
        File path

    Returns
    -------
    node : AST node
    content : utf-8 encoded string
    """
    # read_text automatically converts \r\n to \n
    content = Path(filename).read_text(encoding="utf-8")

    try:
        node = ast.parse(content)
        return node, content
    except SyntaxError:
        return None, content



def _get_docstring_and_rest(filename: Path) -> tuple[str, str, int, ast.Module | None]:
    """Separate ``filename`` content between docstring and the rest.

    Strongly inspired from ast.get_docstring.

    Parameters
    ----------
    filename : Path
        Path of the example file to split.

    Returns
    -------
    docstring : str
        docstring of ``filename``. When the file has a syntax error this is
        ``SYNTAX_ERROR_DOCSTRING`` instead.
    rest : str
        ``filename`` content without the docstring (the whole content when
        the file has a syntax error).
    lineno : int
        The 1-based line number of the first line of ``rest``.
    node : ast.Module or None
        The ast node. When `filename` parsed with `mode='exec'` node should be
        of type `ast.Module`. ``None`` when the file has a syntax error.

    Raises
    ------
    ExtensionError
        If the parsed node is not a module, or if the first statement of the
        module is not a string literal (i.e. there is no docstring).
    """
    node, content = parse_source_file(filename)

    if node is None:
        return SYNTAX_ERROR_DOCSTRING, content, 1, node

    if not isinstance(node, ast.Module):
        raise ExtensionError(
            "This function only supports modules. "
            f"You provided {node.__class__.__name__}"
        )

    first_stmt = node.body[0] if node.body else None
    # A docstring must be a *string* constant: a leading ``b"..."`` or number
    # is also an ``ast.Constant`` but is not a docstring, so it is rejected
    # here with the same user-facing error instead of tripping an assert.
    has_docstring = (
        isinstance(first_stmt, ast.Expr)
        and isinstance(first_stmt.value, ast.Constant)
        and isinstance(first_stmt.value.value, str)
    )
    if not has_docstring:
        raise ExtensionError(
            f'Could not find docstring in file "{filename}". '
            "A docstring is required by sphinx-gallery "
            'unless the file is ignored by "ignore_pattern"'
        )

    docstring = ast.get_docstring(node)
    assert docstring is not None  # guaranteed by the check above
    raw_docstring = first_stmt.value.value
    if raw_docstring.startswith("\n"):
        # ast.get_docstring strips the leading newline; restore it for
        # strict backward compatibility.
        docstring = "\n" + docstring

    # Find the first string token according to the tokenizer and take the row
    # on which it ends: that is the last line of the docstring.
    tokens = tokenize.tokenize(BytesIO(content.encode()).readline)
    for tk in tokens:
        if tk.exact_type == tokenize.STRING:
            lineno, _ = tk.end
            break
    else:
        lineno = 0

    # Keep everything after the last line of the docstring.
    rest = "\n".join(content.split("\n")[lineno:])
    lineno += 1
    return docstring, rest, lineno, node



[docs]
def extract_file_config(content: str) -> dict[str, Any]:
    """Pull out the file-specific config specified in source comments.

    Every comment line of the form ``# sphinx_gallery_<name> = <value>`` is
    collected, with ``<value>`` evaluated as a Python literal.

    Parameters
    ----------
    content : str
        Source text to scan for config comments.

    Returns
    -------
    file_conf : dict
        Mapping of option name (without the ``sphinx_gallery_`` prefix) to its
        evaluated value. Flags without a value and options whose value is not
        a valid literal are left out; the latter also trigger a warning.
    """
    file_conf: dict[str, Any] = {}
    for match in INFILE_CONFIG_PATTERN.finditer(content):
        name = match.group(1)
        value = match.group(3)
        if value is None:  # a flag rather than a config setting
            continue
        try:
            value = ast.literal_eval(value)
        except (SyntaxError, ValueError, TypeError):
            # TypeError covers literals that parse but cannot be built, such
            # as a dict with an unhashable key (``{[1]: 2}``); these are
            # reported like any other invalid value instead of aborting.
            logger.warning(
                "Sphinx-gallery option %s was passed invalid value %s", name, value
            )
        else:
            file_conf[name] = value
    return file_conf



# One parsed block of an example file.
# Double comments below to make mypy not read "type:" as a type comment.
## type: "text" or "code"
## content (str): the block lines as str
## lineno (int): the line number where the block starts
Block = namedtuple("Block", "type content lineno")


# Typing-only overload: with ``return_node=True`` the result is a 3-tuple
# whose last item is the ast node (or None).
@overload
def split_code_and_text_blocks(
    source_file: str | Path,
    return_node: Literal[True],
) -> tuple[dict[str, Any], list, ast.Module | None]: ...


# Typing-only overload: with ``return_node`` omitted or False the result is
# the 2-tuple ``(file_conf, blocks)``.
@overload
def split_code_and_text_blocks(
    source_file: str | Path,
    return_node: Literal[False] = False,
) -> tuple[dict[str, Any], list]: ...



[docs]
def split_code_and_text_blocks(source_file: str | Path, return_node: bool = False):
    """Split a source file into an ordered list of text and code blocks.

    The module docstring becomes the first text block. After it, a line of at
    least 20 ``#`` characters or a ``# %%`` line starts a text block made of
    the comment lines that directly follow it; everything in between is code.

    Parameters
    ----------
    source_file : str or Path
        Path to the source file.
    return_node : bool
        If True, also return the ast node.

    Returns
    -------
    file_conf : dict
        File-specific settings given in source file comments as:
        ``# sphinx_gallery_<name> = <value>``
    blocks : list
        (label, content, line_number)
        List where each element is a tuple with the label ('text' or 'code'),
        the corresponding content string of block and the leading line number.
    node : ast.Module
        The parsed ast node. Only returned when ``return_node`` is True.
    """
    docstring, body, lineno, node = _get_docstring_and_rest(Path(source_file))
    file_conf = extract_file_config(body)

    # A header line followed by the run of comment lines that belongs to it.
    header_re = re.compile(
        r"(?P<header_line>^#{20,}.*|^# ?%%.*)\s(?P<text_content>(?:^#.*\s?)*)",
        flags=re.M,
    )
    # Strips the single leading "#" from every comment line of a text block.
    leading_hash_re = re.compile("^#", flags=re.M)

    blocks = [Block("text", docstring, 1)]
    cursor = 0
    for header in header_re.finditer(body):
        # Everything between the previous match and this header is code.
        code = body[cursor : header.start()]
        if code.strip():
            blocks.append(Block("code", code, lineno))
        # Step over the code lines and the (ignored) header line itself.
        lineno += code.count("\n") + 1

        commented = header.group("text_content")
        text = dedent(leading_hash_re.sub("", commented)).lstrip()
        if text.strip():
            blocks.append(Block("text", text, lineno))
        lineno += commented.count("\n")

        cursor = header.end()

    # Whatever follows the last text block is one final code block.
    tail = body[cursor:]
    if tail.strip():
        blocks.append(Block("code", tail, lineno))

    return (file_conf, blocks, node) if return_node else (file_conf, blocks)




[docs]
def remove_ignore_blocks(code_block: str) -> str:
    """
    Strip every ignore block from *code_block* and return the result.

    An ignore block is opened by a ``# sphinx_gallery_start_ignore`` comment
    and closed by a ``# sphinx_gallery_end_ignore`` comment. Both comment
    lines and everything between them are dropped, while the empty lines
    around the block are kept.

    Parameters
    ----------
    code_block : str
        A code segment.

    Returns
    -------
    str
        The code segment without its ignore blocks.

    Raises
    ------
    ExtensionError
        If the number of start flags differs from the number of end flags.
    """
    start_flags = re.findall(START_IGNORE_FLAG, code_block, re.MULTILINE)
    end_flags = re.findall(END_IGNORE_FLAG, code_block, re.MULTILINE)

    if len(start_flags) == len(end_flags):
        return IGNORE_BLOCK_PATTERN.sub("", code_block)

    raise ExtensionError(
        'All "sphinx_gallery_start_ignore" flags must have a matching '
        '"sphinx_gallery_end_ignore" flag!'
    )




[docs]
def remove_config_comments(code_block: str) -> str:
    """
    Strip in-file config comments from *code_block* and return the result.

    Every comment line shaped like '# sphinx_gallery_[option] = [val]' is
    dropped; the empty lines around it are kept.

    Parameters
    ----------
    code_block : str
        A code segment.

    Returns
    -------
    str
        The code segment without its config comments.
    """
    return INFILE_CONFIG_PATTERN.sub("", code_block)