feat(wpcarro/simple-select): support tokenizer for query language
Support a tokenizer for a query language that looks like: ``` -fname:/W.*m/ lname:"Von Carroll" ``` The parser is on the way in a follow-up change. Change-Id: I2badf14a41313ca2f75dec20adbcf9031b22ab83 Reviewed-on: https://cl.tvl.fyi/c/depot/+/5338 Reviewed-by: wpcarro <wpcarro@gmail.com> Autosubmit: wpcarro <wpcarro@gmail.com> Tested-by: BuildkiteCI
This commit is contained in:
parent
7770ccf0e3
commit
88a3051ae5
2 changed files with 88 additions and 5 deletions
|
@ -1,8 +1,91 @@
|
|||
import string
|
||||
from scanner import Scanner
|
||||
################################################################################
|
||||
# Predicates
|
||||
################################################################################
|
||||
|
||||
def is_alpha(c):
    """Predicate: is `c` an ASCII letter (a-z or A-Z)?"""
    # `find` keeps the same substring semantics as `c in string.ascii_letters`.
    return string.ascii_letters.find(c) != -1
|
||||
|
||||
def is_digit(c):
    """Predicate: is `c` a decimal digit (0-9)?"""
    # Use the stdlib constant (== "0123456789") instead of a hard-coded
    # literal, for consistency with is_alpha's use of the string module.
    return c in string.digits
|
||||
|
||||
def is_alphanumeric(c):
    """Predicate: is `c` an ASCII letter or a decimal digit?"""
    # Both predicates are pure, so the order of the disjunction is immaterial.
    return is_digit(c) or is_alpha(c)
|
||||
|
||||
def is_whitespace(c):
    """Predicate: is `c` a whitespace character significant to the tokenizer?"""
    # `find` keeps the same substring semantics as the original `in` test.
    return " \r\t\n".find(c) != -1
|
||||
|
||||
################################################################################
|
||||
# Tokenizer
|
||||
################################################################################
|
||||
|
||||
def tokenize(x):
    """Tokenize the query string `x` into a list of tokens.

    Whitespace separates tokens; see scan_token for the supported token
    forms (punctuation, quoted strings, /regexes/, identifiers/keywords).
    """
    s = Scanner(x)
    # The previous body returned None before ever calling scan_tokens,
    # making the tokenizer a no-op; return the scanned tokens instead.
    return scan_tokens(s)
|
||||
|
||||
def scan_tokens(s):
    """Drain the scanner `s`, returning the list of all scanned tokens."""
    tokens = []
    while not s.exhausted():
        # Whitespace only separates tokens; it produces none itself.
        if is_whitespace(s.peek()):
            s.advance()
            continue
        tokens.append(scan_token(s))
    return tokens
|
||||
|
||||
def scan_token(s):
    """Scan a single token from `s`.

    Returns either a bare token name (for punctuation) or a
    ("TYPE", value) pair for strings, regexes, keywords, and identifiers.

    Raises:
      Exception: if the next character cannot begin any token.
    """
    punctuation = {
        "-": "NOT",
        ":": "COLON",
    }
    c = s.peek()
    if c in punctuation:
        s.advance()
        return punctuation[c]
    if c == "\"":
        return tokenize_string(s)
    if c == "/":
        return tokenize_regex(s)
    if is_alpha(c):
        return tokenize_identifier(s)
    # Previously an unrecognized character fell through, implicitly
    # returning None WITHOUT advancing the scanner, which made
    # scan_tokens spin forever on it. Fail loudly instead.
    raise Exception("Unexpected character: \"{}\"".format(c))
|
||||
|
||||
def tokenize_string(s):
    """Scan a double-quoted string literal, returning ("STRING", contents).

    Expects `s` to be positioned at the opening double-quote.

    Raises:
      Exception: if the closing double-quote is never found.
    """
    s.advance()  # consume the opening 2x-quote
    chars = []
    # peek() on an exhausted scanner never equals '"', so the swapped
    # guard order is equivalent to the original loop condition.
    while not s.exhausted() and s.peek() != "\"":
        chars.append(s.advance())
    if s.exhausted():
        raise Exception("Unterminated string")
    s.advance()  # consume the closing 2x-quote
    return ("STRING", "".join(chars))
|
||||
|
||||
def tokenize_regex(s):
    """Scan a /slash-delimited/ regex literal, returning ("REGEX", pattern).

    Expects `s` to be positioned at the opening forward-slash.

    Raises:
      Exception: if the closing forward-slash is never found.
    """
    s.advance()  # consume the opening forward-slash
    chars = []
    # peek() on an exhausted scanner never equals '/', so the swapped
    # guard order is equivalent to the original loop condition.
    while not s.exhausted() and s.peek() != "/":
        chars.append(s.advance())
    if s.exhausted():
        raise Exception("Unterminated regex")
    s.advance()  # consume the closing forward-slash
    return ("REGEX", "".join(chars))
|
||||
|
||||
def tokenize_identifier(s):
    """Scan an alphanumeric word from `s`.

    Returns ("KEYWORD", WORD) when the word is AND/OR (case-insensitively),
    otherwise ("IDENTIFIER", word) with the original casing preserved.
    """
    keywords = {"AND", "OR"}
    word = s.advance()
    while is_alphanumeric(s.peek()):
        word += s.advance()
    normalized = word.upper()
    if normalized in keywords:
        return ("KEYWORD", normalized)
    return ("IDENTIFIER", word)
|
||||
|
||||
################################################################################
|
||||
# Main
|
||||
################################################################################
|
||||
|
||||
def main():
|
||||
while True:
|
||||
|
|
|
@ -2,15 +2,15 @@
|
|||
# scanner/lexer needs are peek and advance; other functions (e.g. match) are
|
||||
# nice-to-haves.
|
||||
class Scanner(object):
|
||||
def __init__(self, source):
|
||||
def __init__(self, chars):
|
||||
self.i = 0
|
||||
self.source = source
|
||||
self.chars = chars
|
||||
|
||||
def exhausted(self):
|
||||
return self.i >= len(self.source)
|
||||
return self.i >= len(self.chars)
|
||||
|
||||
def peek(self, n=0):
|
||||
return self.source[self.i + n] if self.i + n < len(self.source) else '\0'
|
||||
return self.chars[self.i + n] if self.i in range(0, len(self.chars)) else '\0'
|
||||
|
||||
def advance(self):
|
||||
result = self.peek()
|
||||
|
|
Loading…
Reference in a new issue