88a3051ae5
Support a tokenizer for a query language that looks like: ``` -fname:/W.*m/ lname:"Von Carroll" ``` Parser otw... Change-Id: I2badf14a41313ca2f75dec20adbcf9031b22ab83 Reviewed-on: https://cl.tvl.fyi/c/depot/+/5338 Reviewed-by: wpcarro <wpcarro@gmail.com> Autosubmit: wpcarro <wpcarro@gmail.com> Tested-by: BuildkiteCI
96 lines
2.2 KiB
Python
96 lines
2.2 KiB
Python
import string
|
|
from scanner import Scanner
|
|
################################################################################
|
|
# Predicates
|
|
################################################################################
|
|
|
|
def is_alpha(c):
    """Return True if character `c` is an ASCII letter (a-z or A-Z)."""
    return c in string.ascii_lowercase or c in string.ascii_uppercase
|
|
def is_digit(c):
    """Return True if character `c` is a decimal digit (0-9)."""
    # string.digits is exactly "0123456789", matching the original literal.
    return c in string.digits
|
|
def is_alphanumeric(c):
    """Return True if character `c` is an ASCII letter or a decimal digit."""
    # Inline the alpha/digit membership checks so this predicate stands alone.
    return c in string.ascii_letters or c in "0123456789"
|
|
def is_whitespace(c):
    """Return True if character `c` is a space, CR, tab, or newline."""
    return c in {" ", "\r", "\t", "\n"}
|
|
################################################################################
|
|
# Tokenizer
|
|
################################################################################
|
|
|
|
def tokenize(x):
    """Tokenize query string `x` into a flat list of tokens."""
    return scan_tokens(Scanner(x))
|
def scan_tokens(s):
    """Drain scanner `s`, collecting tokens and discarding whitespace."""
    tokens = []
    while not s.exhausted():
        if is_whitespace(s.peek()):
            # Whitespace separates tokens but is never itself a token.
            s.advance()
            continue
        tokens.append(scan_token(s))
    return tokens
|
|
def scan_token(s):
    """Scan a single token from scanner `s`.

    Returns a bare token name (e.g. "NOT", "COLON") for punctuation, or a
    ("TYPE", value) tuple for strings, regexes, identifiers, and keywords.

    Raises:
      Exception: if the next character cannot begin any token.
    """
    punctuation = {
        "-": "NOT",
        ":": "COLON",
    }
    c = s.peek()
    if c in punctuation:
        s.advance()
        return punctuation[c]
    if c == "\"":
        return tokenize_string(s)
    if c == "/":
        return tokenize_regex(s)
    if is_alpha(c):
        return tokenize_identifier(s)
    # BUG FIX: previously an unrecognized character (e.g. a leading digit)
    # fell through and returned None WITHOUT advancing the scanner, which
    # made scan_tokens loop forever. Fail loudly instead, matching the
    # explicit errors raised for unterminated strings/regexes.
    raise Exception("Unexpected character: \"{}\"".format(c))
|
|
def tokenize_string(s):
    """Consume a double-quoted string literal, returning ("STRING", text).

    Raises:
      Exception: if the input ends before a closing double-quote.
    """
    s.advance()  # discard the opening double-quote
    chars = []
    while s.peek() != "\"" and not s.exhausted():
        chars.append(s.advance())
    if s.exhausted():
        raise Exception("Unterminated string")
    s.advance()  # discard the closing double-quote
    return ("STRING", "".join(chars))
|
|
def tokenize_regex(s):
    """Consume a /slash-delimited/ regex literal, returning ("REGEX", text).

    Raises:
      Exception: if the input ends before a closing forward-slash.
    """
    s.advance()  # discard the opening forward-slash
    chars = []
    while s.peek() != "/" and not s.exhausted():
        chars.append(s.advance())
    if s.exhausted():
        raise Exception("Unterminated regex")
    s.advance()  # discard the closing forward-slash
    return ("REGEX", "".join(chars))
|
|
def tokenize_identifier(s):
    """Consume an identifier or keyword from scanner `s`.

    Returns ("KEYWORD", "AND"|"OR") when the word is a boolean operator
    (matched case-insensitively), otherwise ("IDENTIFIER", text).
    """
    keywords = {
        "AND",
        "OR",
    }
    current = s.advance()
    # ROBUSTNESS FIX: check for exhaustion before peeking. The original
    # relied on Scanner.peek() behaving safely at end-of-input, which this
    # file cannot guarantee; guarding here is correct either way.
    while not s.exhausted() and is_alphanumeric(s.peek()):
        current += s.advance()
    if current.upper() in keywords:
        return ("KEYWORD", current.upper())
    else:
        return ("IDENTIFIER", current)
|
|
################################################################################
|
|
# Main
|
|
################################################################################
|
|
|
|
def main():
    """Simple REPL: read one query per line and print its token stream."""
    while True:
        try:
            x = input("> ")
        except (EOFError, KeyboardInterrupt):
            # BUG FIX: input() raises EOFError on Ctrl-D (and
            # KeyboardInterrupt on Ctrl-C), which previously dumped a
            # traceback. Exit the REPL cleanly instead.
            print()
            break
        print(tokenize(x))


if __name__ == "__main__":
    main()