From 88a3051ae5a0b34670a5f43c6fa30a55ad76f8f1 Mon Sep 17 00:00:00 2001
From: William Carroll
Date: Mon, 28 Feb 2022 11:59:55 -0800
Subject: [PATCH] feat(wpcarro/simple-select): support tokenizer for query
 language

Support a tokenizer for a query language that looks like:

```
-fname:/W.*m/ lname:"Von Carroll"
```

Parser otw...

Change-Id: I2badf14a41313ca2f75dec20adbcf9031b22ab83
Reviewed-on: https://cl.tvl.fyi/c/depot/+/5338
Reviewed-by: wpcarro
Autosubmit: wpcarro
Tested-by: BuildkiteCI
---
 users/wpcarro/scratch/simple-select/main.py   | 85 ++++++++++++++++++-
 .../wpcarro/scratch/simple-select/scanner.py  |  8 +-
 2 files changed, 88 insertions(+), 5 deletions(-)

diff --git a/users/wpcarro/scratch/simple-select/main.py b/users/wpcarro/scratch/simple-select/main.py
index 6a86324ef..0aea8dcff 100644
--- a/users/wpcarro/scratch/simple-select/main.py
+++ b/users/wpcarro/scratch/simple-select/main.py
@@ -1,8 +1,91 @@
+import string
 from scanner import Scanner
 
+################################################################################
+# Predicates
+################################################################################
+
+def is_alpha(c):
+    return c in string.ascii_letters
+
+def is_digit(c):
+    return c in "0123456789"
+
+def is_alphanumeric(c):
+    return is_alpha(c) or is_digit(c)
+
+def is_whitespace(c):
+    return c in " \r\t\n"
+
+################################################################################
+# Tokenizer
+################################################################################
+
 def tokenize(x):
     s = Scanner(x)
-    return None
+    tokens = scan_tokens(s)
+    return tokens
+
+def scan_tokens(s):
+    result = []
+    while not s.exhausted():
+        if is_whitespace(s.peek()):
+            s.advance()
+        else:
+            result.append(scan_token(s))
+    return result
+
+def scan_token(s):
+    punctuation = {
+        "-": "NOT",
+        ":": "COLON",
+    }
+    c = s.peek()
+    if c in punctuation:
+        s.advance()
+        return punctuation[c]
+    if c == "\"":
+        return tokenize_string(s)
+    if c == "/":
+        return tokenize_regex(s)
+    if is_alpha(c):
+        return tokenize_identifier(s)
+
+def tokenize_string(s):
+    s.advance() # ignore opening 2x-quote
+    current = ""
+    while s.peek() != "\"" and not s.exhausted():
+        current += s.advance()
+    if s.exhausted():
+        raise Exception("Unterminated string")
+    s.advance() # ignore closing 2x-quote
+    return ("STRING", current)
+
+def tokenize_regex(s):
+    s.advance() # ignore opening forward-slash
+    current = ""
+    while s.peek() != "/" and not s.exhausted():
+        current += s.advance()
+    if s.exhausted():
+        raise Exception("Unterminated regex")
+    s.advance() # ignore closing forward-slash
+    return ("REGEX", current)
+
+def tokenize_identifier(s):
+    keywords = {
+        "AND",
+        "OR",
+    }
+    current = s.advance()
+    while is_alphanumeric(s.peek()):
+        current += s.advance()
+    if current.upper() in keywords:
+        return ("KEYWORD", current.upper())
+    else:
+        return ("IDENTIFIER", current)
+
+################################################################################
+# Main
+################################################################################
 
 def main():
     while True:
diff --git a/users/wpcarro/scratch/simple-select/scanner.py b/users/wpcarro/scratch/simple-select/scanner.py
index 96704ec1a..5dae68aee 100644
--- a/users/wpcarro/scratch/simple-select/scanner.py
+++ b/users/wpcarro/scratch/simple-select/scanner.py
@@ -2,15 +2,15 @@
 # scanner/lexer needs are peek and advance; other functions (e.g. match) are
 # nice-to-haves.
 class Scanner(object):
-    def __init__(self, source):
+    def __init__(self, chars):
         self.i = 0
-        self.source = source
+        self.chars = chars
 
     def exhausted(self):
-        return self.i >= len(self.source)
+        return self.i >= len(self.chars)
 
     def peek(self, n=0):
-        return self.source[self.i + n] if self.i + n < len(self.source) else '\0'
+        return self.chars[self.i + n] if self.i in range(0, len(self.chars)) else '\0'
 
     def advance(self):
         result = self.peek()