feat(wpcarro/simple-select): support tokenizer for query language

Support a tokenizer for a query language that looks like:

```
-fname:/W.*m/ lname:"Von Carroll"
```

A parser for this token stream will follow in a later change.

Change-Id: I2badf14a41313ca2f75dec20adbcf9031b22ab83
Reviewed-on: https://cl.tvl.fyi/c/depot/+/5338
Reviewed-by: wpcarro <wpcarro@gmail.com>
Autosubmit: wpcarro <wpcarro@gmail.com>
Tested-by: BuildkiteCI
This commit is contained in:
William Carroll 2022-02-28 11:59:55 -08:00 committed by clbot
parent 7770ccf0e3
commit 88a3051ae5
2 changed files with 88 additions and 5 deletions

View file

@ -1,8 +1,91 @@
import string
from scanner import Scanner
################################################################################
# Predicates
################################################################################
def is_alpha(c):
    """Return True when character `c` is an ASCII letter (a-z or A-Z)."""
    return c in string.ascii_lowercase or c in string.ascii_uppercase
def is_digit(c):
    """Return True when character `c` is an ASCII digit (0-9).

    Uses `string.digits` for consistency with `is_alpha`, which draws its
    character class from the `string` module rather than a hardcoded literal.
    """
    return c in string.digits
def is_alphanumeric(c):
    """Return True when character `c` is an ASCII letter or digit."""
    return any(pred(c) for pred in (is_alpha, is_digit))
def is_whitespace(c):
    """Return True when character `c` is a space, CR, tab, or newline."""
    return c in (" ", "\r", "\t", "\n")
################################################################################
# Tokenizer
################################################################################
def tokenize(x):
    """Tokenize the query string `x` and return the list of tokens.

    BUG FIX: the original had a stray `return None` immediately after
    constructing the Scanner, which made the actual tokenization dead code
    and caused every call to return None.
    """
    s = Scanner(x)
    return scan_tokens(s)
def scan_tokens(s):
    """Consume scanner `s` and return the list of tokens, skipping whitespace."""
    tokens = []
    while not s.exhausted():
        if is_whitespace(s.peek()):
            # Whitespace separates tokens but produces none of its own.
            s.advance()
            continue
        tokens.append(scan_token(s))
    return tokens
def scan_token(s):
    """Scan and return the next token from scanner `s`.

    Tokens are either bare punctuation labels ("NOT", "COLON") or
    (type, lexeme) tuples produced by the specialized tokenizers.

    Raises:
        Exception: on a character that begins no known token. The original
        silently returned None here WITHOUT advancing the scanner, which made
        scan_tokens spin forever on any unexpected input character.
    """
    punctuation = {
        "-": "NOT",
        ":": "COLON",
    }
    c = s.peek()
    if c in punctuation:
        s.advance()
        return punctuation[c]
    if c == "\"":
        return tokenize_string(s)
    if c == "/":
        return tokenize_regex(s)
    if is_alpha(c):
        return tokenize_identifier(s)
    raise Exception("Unexpected character: \"{}\"".format(c))
def tokenize_string(s):
    """Scan a double-quoted string literal and return a ("STRING", text) token.

    Raises an exception when the input ends before the closing quote.
    """
    s.advance()  # drop the opening double-quote
    chars = []
    while s.peek() != "\"" and not s.exhausted():
        chars.append(s.advance())
    if s.exhausted():
        raise Exception("Unterminated string")
    s.advance()  # drop the closing double-quote
    return ("STRING", "".join(chars))
def tokenize_regex(s):
    """Scan a /slash-delimited/ regex literal and return a ("REGEX", text) token.

    Raises an exception when the input ends before the closing slash.
    """
    s.advance()  # drop the opening forward-slash
    chars = []
    while s.peek() != "/" and not s.exhausted():
        chars.append(s.advance())
    if s.exhausted():
        raise Exception("Unterminated regex")
    s.advance()  # drop the closing forward-slash
    return ("REGEX", "".join(chars))
def tokenize_identifier(s):
    """Scan an alphanumeric word from `s`.

    Words matching AND/OR (case-insensitively) become ("KEYWORD", WORD)
    tokens; everything else becomes ("IDENTIFIER", word) with original case.
    """
    keywords = {"AND", "OR"}
    chars = [s.advance()]
    while is_alphanumeric(s.peek()):
        chars.append(s.advance())
    word = "".join(chars)
    upper = word.upper()
    if upper in keywords:
        return ("KEYWORD", upper)
    return ("IDENTIFIER", word)
################################################################################
# Main
################################################################################
def main():
while True:

View file

@ -2,15 +2,15 @@
# scanner/lexer needs are peek and advance; other functions (e.g. match) are
# nice-to-haves.
class Scanner(object):
def __init__(self, source):
def __init__(self, chars):
self.i = 0
self.source = source
self.chars = chars
def exhausted(self):
return self.i >= len(self.source)
return self.i >= len(self.chars)
def peek(self, n=0):
return self.source[self.i + n] if self.i + n < len(self.source) else '\0'
return self.chars[self.i + n] if self.i in range(0, len(self.chars)) else '\0'
def advance(self):
result = self.peek()