在 PyQt 中實現基於正規表達式的 Syntax Highlighter

因為需要用 Qt 來顯示程式碼,所以基於正規表達式(Regular Expression)實作了一個簡易的 Syntax Highlighter,並且允許客製化顏色主題。

範例程式碼

為了減少程式碼的重複,將必要的部分擷取出來寫成一個 AbsHighlighter,後面實作的 Syntax Highlighting Class 都會繼承這個類別。預設顏色寫在 THEMES["Default"]

# Color themes: maps a theme name to a {token category -> text format} table.
# Every value is produced by a helper named `format` that is defined outside
# this excerpt; presumably it takes a color name plus an optional font style
# ("bold" / "italic") -- confirm against its definition.
#
# The highlighter classes look entries up by these exact keys, e.g.
# style["keyword"], style["numbers"], style["string2"], so a custom theme
# must provide the same set of keys.
THEMES = {
    "Default": {
        "constant": format("coral"),
        "keyword": format("blue"),
        "builtin-func": format("darkCyan"),
        "builtin-type": format("darkCyan"),
        "operator": format("dimGray"),
        "brace": format("darkGray"),
        "function": format("black", "bold"),
        "class": format("black", "bold"),
        "string": format("orchid"),
        # "string2" is used for triple-quoted (multi-line) strings
        "string2": format("darkMagenta"),
        "method": format("fireBrick"),
        "comment": format("darkGreen", "italic"),
        "self": format("black", "italic"),
        "numbers": format("darkOrange"),
    },
    # ... (further themes elided in the original)
}

class AbsHighlighter(QSyntaxHighlighter):
    """Base class for regular-expression driven syntax highlighters.

    Subclasses customise the highlighter by overriding the class-level word
    lists below and, optionally, the ``post_init`` / ``pre_highlight`` /
    ``post_highlight`` hooks.

    Every entry of the word lists is interpolated into a regular expression
    *without escaping*, so entries containing regex metacharacters must
    already be escaped (see ``braces``).

    Each rule in ``self.rules`` is a tuple ``(order, QRegExp, nth, fmt)``:
    ``order`` controls application order (larger values are applied later and
    therefore win on overlapping text), ``nth`` is the capture group whose
    span gets formatted (0 = whole match) and ``fmt`` is the format passed to
    ``setFormat``.
    """

    keywords = []
    funcs = []
    types = []
    operators = []
    braces = [r"\{", r"\}", r"\(", r"\)", r"\[", r"\]"]
    builtin_values = []

    def __init__(self, document, style):
        """Build the rule table.

        ``document`` is handed to ``QSyntaxHighlighter``; ``style`` is a
        mapping from token category (``"keyword"``, ``"numbers"``, ...) to
        the format used for that category.
        """
        QSyntaxHighlighter.__init__(self, document)

        rules = []
        # Keyword, operator, and brace rules
        rules += [(50, r"\b%s\b" % w, 0, style["keyword"]) for w in self.keywords]
        rules += [(50, r"\b%s\b" % w, 0, style["numbers"]) for w in self.builtin_values]
        rules += [(50, r"\b%s\b" % o, 0, style["builtin-func"]) for o in self.funcs]
        rules += [(50, r"\b%s\b" % o, 0, style["builtin-type"]) for o in self.types]
        rules += [(50, r"%s" % o, 0, style["operator"]) for o in self.operators]
        rules += [(50, r"%s" % b, 0, style["brace"]) for b in self.braces]
        # Numeric literals
        rules += [(50, r"\b[+-]?[0-9]+[lL]?\b", 0, style["numbers"])]
        rules += [(50, r"\b[+-]?0[xX][0-9A-Fa-f]+[lL]?\b", 0, style["numbers"])]
        rules += [
            (50, r"\b[+-]?[0-9]+(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?\b", 0, style["numbers"])
        ]

        # Build a QRegExp for each pattern
        # larger order will be applied later
        self.rules = [(order, QRegExp(pat), index, fmt) for (order, pat, index, fmt) in rules]

        # ``style`` must be available before post_init(), which typically
        # reads it to add language-specific rules.
        self.style = style
        self.post_init()

        # Sort after post_init() so rules added by the subclass are ordered
        # too. list.sort is stable, so equal-order rules keep insertion order.
        self.rules.sort(key=lambda rule: rule[0])

    def post_init(self):
        """Hook: subclasses may append extra rules to ``self.rules`` here."""
        pass

    def highlightBlock(self, text):
        """Apply syntax highlighting to the given block of text."""
        # Do other syntax formatting
        self.pre_highlight(text)

        for _, expression, nth, fmt in self.rules:
            match_start = expression.indexIn(text, 0)

            while match_start >= 0:
                # We actually want the span of the nth capture group
                start = expression.pos(nth)
                length = len(expression.cap(nth))

                if start >= 0 and length > 0:
                    self.setFormat(start, length, fmt)
                    next_offset = start + length
                else:
                    # The group is empty or did not take part in the match
                    # (pos() is -1 then). Searching again from the same spot
                    # would find the same match forever, and a negative
                    # offset would restart from the end of the text, so step
                    # past the whole match, advancing at least one character.
                    next_offset = match_start + max(expression.matchedLength(), 1)

                match_start = expression.indexIn(text, next_offset)

        self.post_highlight(text)

    def pre_highlight(self, text):
        """Hook: called with the block text before the rules are applied."""
        pass

    def post_highlight(self, text):
        """Hook: called with the block text after the rules are applied."""
        pass

Python Syntax Highlighter

來練習使用 AbsHighlighter 寫一個 Python Syntax Highlighter。

class PythonHighlighter(AbsHighlighter):
    """Syntax highlighter for the Python language.

    Single-line constructs are handled by regular-expression rules added in
    ``post_init``; triple-quoted strings, which may span several blocks, are
    handled in ``post_highlight`` using the block state (0 = normal,
    1 = inside ''' ... ''', 2 = inside \"\"\" ... \"\"\").
    """

    # fmt: off
    keywords = [
        "assert", "and", "as", "async", "await", "break", "class", "continue",
        "def", "del", "elif", "else", "except", "exec", "finally", "for", "from",
        "global", "if", "import", "in", "is", "nonlocal", "not", "or", "pass",
        "raise", "return", "try", "lambda", "while", "with", "yield",
    ]
    funcs = [
        "abs", "aiter", "all", "any", "anext", "ascii", "bin", "bool", "breakpoint",
        "callable", "chr", "classmethod", "compile", "complex", "delattr",
        "dir", "divmod", "enumerate", "eval", "exec", "filter", "float", "format",
        "frozenset", "getattr", "globals", "hasattr", "hash", "help", "hex", "id",
        "input", "isinstance", "issubclass", "iter", "len", "locals",
        "max", "memoryview", "min", "next", "object", "oct", "open", "ord",
        "pow", "print", "property", "range", "repr", "reversed", "round",
        "setattr", "slice", "sorted", "staticmethod", "sum", "super", "type", "vars", "zip",
    ]
    types = [
        "bytes", "bytearray", "int", "str", "list",
        "map", "dict", "set", "tuple",
    ]
    operators = [
        "=",
        # Comparison
        "==", "!=", "<", "<=", ">", ">=",
        # Arithmetic
        r"\+", "-", r"\*", "/", "//", r"\%", r"\*\*",
        # Bitwise
        r"\^", r"\|", r"\&", r"\~", ">>", "<<",
        # In-place
        r"\+=", "-=", r"\*=", "/=", r"\%=", r"\^=", r"\|=", r"\&=", r"\~=", ">>=", "<<=",
    ]
    builtin_values = [
        "None", "True", "False",
    ]
    # fmt: on

    def post_init(self):
        """Add the Python-specific rules and the multi-line string matchers.

        Rules are ``(order, QRegExp, capture group, format)``; a larger order
        is applied later, so comments (60) override everything else and the
        generic capitalised-name rule (40) is overridden by the others.
        """
        self.rules += [
            # decorator: only the leading '@' is formatted
            (50, QRegExp(r"^\s*(@).+$"), 1, self.style["keyword"]),
            # 'self'
            (50, QRegExp(r"\bself\b"), 0, self.style["self"]),
            # dot operation method
            (50, QRegExp(r"\.(\w+)\(.*\)"), 1, self.style["method"]),
            # 'def' followed by an identifier
            (50, QRegExp(r"\bdef\b\s*(\w+)"), 1, self.style["function"]),
            # 'class' followed by an identifier
            (50, QRegExp(r"\bclass\b\s*(\w+)"), 1, self.style["class"]),
            # any identifier starting with a capital letter
            (40, QRegExp(r"\b([A-Z]\w+)"), 1, self.style["class"]),
            # 'constant'
            (50, QRegExp(r"\b[A-Z][A-Z_]*\b"), 0, self.style["constant"]),
            # From '#' until a newline, '#' after other text on the line
            (60, QRegExp(r"[^\'\"=\w\[\]\{\}]#[^\n]*"), 0, self.style["comment"]),
            # Single-quoted string, possibly containing escape sequences
            (50, QRegExp(r"'[^'\\]*(\\.[^'\\]*)*'"), 0, self.style["string"]),
            # Double-quoted string, possibly containing escape sequences
            (50, QRegExp(r'"[^"\\]*(\\.[^"\\]*)*"'), 0, self.style["string"]),
            # From '#' until a newline, '#' at the beginning of the line
            (60, QRegExp(r"^#[^\n]*"), 0, self.style["comment"]),
        ]
        # Multi-line strings (expression, flag, self.style)
        # FIXME: The triple-quotes in these two lines will mess up the
        # syntax highlighting from this point onward
        self.tri_single = (QRegExp("'''"), 1, self.style["string2"])
        self.tri_double = (QRegExp('"""'), 2, self.style["string2"])

    def post_highlight(self, text):
        """Highlight triple-quoted strings and record the block state."""
        self.setCurrentBlockState(0)
        # Do multi-line strings; only try the second delimiter when this
        # block does not end inside a string of the first kind.
        if not self.match_multiline(text, *self.tri_single):
            self.match_multiline(text, *self.tri_double)

    def match_multiline(self, text, delimiter, in_state, style):
        """Do highlighting of multi-line strings. ``delimiter`` should be a
        ``QRegExp`` for triple-single-quotes or triple-double-quotes, and
        ``in_state`` should be a unique integer to represent the corresponding
        state changes when inside those strings. Returns True if we're still
        inside a multi-line string when this function is finished.
        """
        # If the previous block ended inside this kind of string, the string
        # starts at column 0 and there is no opening delimiter to skip.
        if self.previousBlockState() == in_state:
            start = 0
            add = 0
        # Otherwise, look for the opening delimiter on this line
        else:
            start = delimiter.indexIn(text)
            # Width of the opening delimiter, skipped when searching for the end
            add = delimiter.matchedLength()

        # As long as there's a string starting on this line...
        while start >= 0:
            # Look for the ending delimiter
            end = delimiter.indexIn(text, start + add)
            if end < 0:
                # No ending delimiter: the string runs to the end of the
                # block and continues on the next one.
                self.setCurrentBlockState(in_state)
                self.setFormat(start, len(text) - start, style)
                break

            # String closes on this line: format from its start through the
            # closing delimiter (the opening delimiter is already inside the
            # span because ``start`` points at it).
            length = end - start + delimiter.matchedLength()
            self.setCurrentBlockState(0)
            self.setFormat(start, length, style)

            # Look for the next string on the same line. From now on every
            # match is an opening delimiter, so it must be skipped when
            # searching for its end, even if this block began inside a string.
            start = delimiter.indexIn(text, start + length)
            add = delimiter.matchedLength()

        # True if still inside a multi-line string, False otherwise
        return self.currentBlockState() == in_state

使用方法

# get your plaintext editor
editor = QtGui.QPlainTextEdit()

# then apply our syntax highlighter; the constructor needs both the
# document to highlight and a style table such as THEMES["Default"]
highlight = PythonHighlighter(editor.document(), THEMES["Default"])

使用正規表達式,並不能正確地標示出所有程式語法,但對於簡易文字編輯器而言,已足夠。未來有機會可以再研究一下要怎麼搭配 Tree-sitter。

References