r"""Parser for python source files."""
# Created Sun Nov 27 14:03:07 2016
# Author: Óscar Nájera
import codecs
import ast
from io import BytesIO
import re
import tokenize
from textwrap import dedent
from sphinx.errors import ExtensionError
from sphinx.util.logging import getLogger
# Logger registered under the "sphinx-gallery" name.
logger = getLogger("sphinx-gallery")
# Placeholder reST docstring that stands in for the module docstring of an
# example script that fails to parse (see ``_get_docstring_and_rest``).
SYNTAX_ERROR_DOCSTRING = """
SyntaxError
===========
Example script with invalid Python syntax
"""
# The pattern for in-file config comments is designed to not greedily match
# newlines at the start and end, except for one newline at the end. This
# ensures that the matched pattern can be removed from the code without
# changing the block structure; i.e. empty newlines are preserved, e.g. in
#
#     a = 1
#
#     # sphinx_gallery_thumbnail_number = 2
#
#     b = 2
#
# FLAG_START matches optional spaces/tabs at the start of a line, the ``#``
# comment marker and any whitespace that follows it.
FLAG_START = r"^[\ \t]*#\s*"
# FLAG_BODY matches ``sphinx_gallery_<name>`` (group 1 captures ``<name>``),
# optionally followed by ``= <value>`` (group 3 captures ``<value>``), then
# trailing spaces/tabs and at most one newline.
FLAG_BODY = r"sphinx_gallery_([A-Za-z0-9_]+)(\s*=\s*(.+))?[\ \t]*\n?"
# MULTILINE makes the ``^`` of FLAG_START match at the start of every line,
# not only at the start of the string.
INFILE_CONFIG_PATTERN = re.compile(FLAG_START + FLAG_BODY, re.MULTILINE)
# Uncompiled regex sources for the comment markers that open and close an
# ignored region of code.
START_IGNORE_FLAG = FLAG_START + "sphinx_gallery_start_ignore"
END_IGNORE_FLAG = FLAG_START + "sphinx_gallery_end_ignore"
# Matches a start marker, everything up to the nearest end marker (non-greedy,
# newlines included) and at most one newline directly after the end marker.
IGNORE_BLOCK_PATTERN = re.compile(
rf"{START_IGNORE_FLAG}(?:[\s\S]*?){END_IGNORE_FLAG}\n?", re.MULTILINE
)
[docs]
def parse_source_file(filename):
    """Parse source file into AST node.

    Parameters
    ----------
    filename : str
        File path

    Returns
    -------
    node : ast.Module or None
        The parsed module, or ``None`` when the file content raises a
        ``SyntaxError`` while being parsed.
    content : str
        The file content decoded as UTF-8, with Windows line endings
        converted to UNIX ones.
    """
    # ``codecs.open`` is deprecated (Python 3.14) in favour of the built-in
    # ``open``. ``newline=""`` disables newline translation so that, exactly
    # as before, only CRLF pairs are rewritten below and a lone carriage
    # return is left untouched.
    with open(filename, "r", encoding="utf-8", newline="") as fid:
        content = fid.read()
    # change from Windows format to UNIX for uniformity
    content = content.replace("\r\n", "\n")

    try:
        node = ast.parse(content)
    except SyntaxError:
        # Callers still need the raw text to report on the broken file.
        return None, content
    return node, content
def _get_docstring_and_rest(filename):
    """Separate ``filename`` content between docstring and the rest.

    Strongly inspired from ast.get_docstring.

    Parameters
    ----------
    filename : str
        Path to the Python source file.

    Returns
    -------
    docstring : str
        docstring of ``filename``
    rest : str
        ``filename`` content without the docstring
    lineno : int
        The 1-based line number of the first line of ``rest``.
    node : ast.Module or None
        The ast node. When `filename` parsed with `mode='exec'` node should be
        of type `ast.Module`. ``None`` when the file could not be parsed.

    Raises
    ------
    ExtensionError
        If the parsed node is not a module, or if the module does not start
        with a docstring.
    """
    node, content = parse_source_file(filename)

    if node is None:
        # Unparsable file: use the placeholder docstring and hand back the
        # whole file as "rest", starting at line 1.
        return SYNTAX_ERROR_DOCSTRING, content, 1, node

    if not isinstance(node, ast.Module):
        raise ExtensionError(
            "This function only supports modules. "
            "You provided {}".format(node.__class__.__name__)
        )

    # ``ast.get_docstring`` returns None unless the first statement is a bare
    # string literal. Using it avoids ``ast.Str`` and the ``.s`` attribute,
    # which are deprecated since Python 3.8 and removed in Python 3.14.
    raw_docstring = ast.get_docstring(node, clean=False)
    if raw_docstring is None:
        raise ExtensionError(
            'Could not find docstring in file "{}". '
            "A docstring is required by sphinx-gallery "
            'unless the file is ignored by "ignore_pattern"'.format(filename)
        )

    docstring = ast.get_docstring(node)
    # Cleaning strips a leading newline; put it back for strict backward
    # compatibility with the previous output.
    if raw_docstring.startswith("\n"):
        docstring = "\n" + docstring

    ts = tokenize.tokenize(BytesIO(content.encode()).readline)
    # find the first string according to the tokenizer and get its end row
    for tk in ts:
        if tk.exact_type == tokenize.STRING:
            lineno, _ = tk.end
            break
    else:
        lineno = 0

    # This gets the content of the file after the docstring's last line
    rest = "\n".join(content.split("\n")[lineno:])
    # Convert "last docstring row" into "first row of ``rest``".
    lineno += 1
    return docstring, rest, lineno, node
[docs]
def split_code_and_text_blocks(source_file, return_node=False):
    """Split a source file into its sequence of text and code blocks.

    The module docstring becomes the first text block. After it, a header
    line (at least 20 ``#`` characters, or a line starting with ``#%%`` or
    ``# %%``) opens a text block made of the ``#``-prefixed lines that
    directly follow it; everything between text blocks is a code block.

    Parameters
    ----------
    source_file : str
        Path to the source file.
    return_node : bool
        If True, return the ast node.

    Returns
    -------
    file_conf : dict
        File-specific settings given in source file comments as:
        ``# sphinx_gallery_<name> = <value>``
    blocks : list
        (label, content, line_number)
        List where each element is a tuple with the label ('text' or 'code'),
        the corresponding content string of block and the leading line number.
    node : ast.Module
        The parsed ast node. Only included when ``return_node`` is True.
    """
    docstring, body, current_line, node = _get_docstring_and_rest(source_file)
    file_conf = extract_file_config(body)

    header_re = re.compile(
        r"(?P<header_line>^#{20,}.*|^# ?%%.*)\s(?P<text_content>(?:^#.*\s?)*)",
        flags=re.M,
    )
    leading_hash_re = re.compile("^#", flags=re.M)

    blocks = [("text", docstring, 1)]
    cursor = 0
    for found in header_re.finditer(body):
        # Code sitting between the previous text block and this header.
        code_chunk = body[cursor : found.start()]
        if code_chunk.strip():
            blocks.append(("code", code_chunk, current_line))
        # Skip the code lines plus the header line itself, which is dropped.
        current_line += code_chunk.count("\n") + 1

        # Turn the commented lines into plain text by removing the ``#``.
        commented_text = found.group("text_content")
        text_chunk = dedent(leading_hash_re.sub("", commented_text)).lstrip()
        if text_chunk.strip():
            blocks.append(("text", text_chunk, current_line))
        current_line += commented_text.count("\n")

        cursor = found.end()

    # Whatever follows the last text block is trailing code.
    trailing_code = body[cursor:]
    if trailing_code.strip():
        blocks.append(("code", trailing_code, current_line))

    if return_node:
        return file_conf, blocks, node
    return file_conf, blocks
[docs]
def remove_ignore_blocks(code_block):
    """Strip ignored regions out of *code_block* and return the result.

    An ignored region begins at a ``# sphinx_gallery_start_ignore`` comment
    and finishes at the next ``# sphinx_gallery_end_ignore`` comment. Both
    marker lines and everything between them are removed, while the empty
    lines around the region are kept.

    Parameters
    ----------
    code_block : str
        A code segment.

    Returns
    -------
    str
        The code segment without its ignored regions.

    Raises
    ------
    ExtensionError
        If the number of start markers differs from the number of end
        markers.
    """
    start_markers = re.findall(START_IGNORE_FLAG, code_block, re.MULTILINE)
    end_markers = re.findall(END_IGNORE_FLAG, code_block, re.MULTILINE)

    # Only the counts are compared; an unbalanced pair is reported up front.
    if len(start_markers) != len(end_markers):
        raise ExtensionError(
            'All "sphinx_gallery_start_ignore" flags must have a matching '
            '"sphinx_gallery_end_ignore" flag!'
        )

    return IGNORE_BLOCK_PATTERN.sub("", code_block)