tvl-depot/users/wpcarro/scratch/facebook/parsing/regex.py

# Writing a small proof-of-concept...
#   - lexer
#   - parser
#   - compiler
# ...for regex.
#
# BNF
# expression -> ( char_class | CHAR ) quantifier? ( "|" expression )*
# char_class -> "[" CHAR+ "]"
# quantifier -> "?" | "*" | "+" | "{" INT? "," INT? "}"
#
# Of the numerous things I do not support, here are a few items of which I'm
# aware:
#   - alternatives:   (a|b)
#   - capture groups: (ab)cd

from parser import Parser
import string

################################################################################
# Top-Level API
################################################################################

def tokenize(xs):
    """
    Transform `xs` into a list of tokens.

    Also: expand shorthand symbols using the following table:
      - ? -> {0,1}
      - * -> {0,}
      - + -> {1,}
    """
    result = []
    i = 0
    shorthand = {
        "?": ["{", 0, ",", 1, "}"],
        "*": ["{", 0, ",", "}"],
        "+": ["{", 1, ",", "}"],
    }
    while i < len(xs):
        if xs[i] in shorthand:
            for c in shorthand[xs[i]]:
                result.append(c)
            i += 1
        elif xs[i] == "{":
            result.append(xs[i])
            i += 1
            curr = ""
            while xs[i] in string.digits:
                curr += xs[i]
                i += 1
            result.append(int(curr))
            assert xs[i] == ","
            result.append(",")
            i += 1
            curr = ""
            while xs[i] in string.digits:
                curr += xs[i]
                i += 1
            result.append(int(curr))
        else:
            result.append(xs[i])
            i += 1
    return result

def parse(expr):
    """
    Tokenize `expr` and convert it into a parse-tree.
    """
    tokens = tokenize(expr)
    return parse_tokens(tokens)

def compile(xs):
    """
    Transform `xs`, a parse-tree representing a regex, into a function that
    accepts a string, and returns the substring that the regex matches.
    """
    def fn(input):
        match = ""
        i = 0
        for x in xs:
            matches, q = x[1], x[2]
            lo, hi = q[1], q[2]
            for j in range(lo):
                if i < len(input) and input[i] in matches:
                    match += input[i]
                    i += 1
                else:
                    print("Failed to match {} with {}".format(input[i], matches))
                    return None
            if hi == float('inf'):
                while i < len(input) and input[i] in matches:
                    match += input[i]
                    i += 1
            else:
                for j in range(hi - lo):
                    if i < len(input) and input[i] in matches:
                        match += input[i]
                        i += 1
        return match
    return fn

################################################################################
# Helper Functions
################################################################################

def parse_tokens(tokens):
    result = []
    parser = Parser(tokens)
    while not parser.exhausted():
        result.append(parse_expression(parser))
    return result

def parse_expression(parser):
    if parser.curr() == "[":
        return parse_character_class(parser)
    else:
        return parse_character(parser)

def parse_character_class(parser):
    parser.expect("[")
    beg = parser.consume()
    parser.expect("-")
    end = parser.consume()
    parser.expect("]")
    if parser.curr() == "{":
        q = parse_quantifier(parser)
    return char_class(xs=expand_range(beg, end), q=q)

def parse_quantifier(parser):
    parser.expect("{")
    if parser.match([","]):
        end = parser.consume()
        parser.expect("}")
        return quantifier(beg=0, end=end)
    else:
        beg = parser.consume()
        parser.expect(",")
        if parser.match(["}"]):
            return quantifier(beg=beg)
        else:
            end = parser.consume()
            parser.expect("}")
            return quantifier(beg=beg, end=end)

def parse_character(parser):
    c = parser.consume()
    q = None
    if parser.curr() == "{":
        q = parse_quantifier(parser)
    return char_class(xs={c}, q=q)

def char_class(xs=set(), q=None):
    if not q:
        q = quantifier(beg=1, end=1)
    return ["CHARACTER_CLASS", xs, q]

def expand_range(beg, end):
    # TODO: Implement this
    return {string.printable[i]
            for i in range(string.printable.index(beg),
                           string.printable.index(end) + 1)}

def quantifier(beg=0, end=float('inf')):
    return ['QUANTIFIER', beg, end]

################################################################################
# Tests
################################################################################

xs = [
    ("[a-c]*[0-9]{2,3}", ["dog"]),
    ("ca+t?", ["cat", "caaaat", "ca", "dog"]),
]

for re, inputs in xs:
    print("Regex:  {}".format(re))
    print("Tokens: {}".format(tokenize(re)))
    print("Parsed: {}".format(parse(re)))
    print("\nTESTS")
    for input in inputs:
        print("Attempting to match \"{}\"...".format(input))
        parser = compile(parse(re))
        print("Result: \"{}\"\n".format(parser(input)))