feat(wpcarro/simple-select): support tokenizer for query language

Support a tokenizer for a query language that looks like:

```
-fname:/W.*m/ lname:"Von Carroll"
```

A parser for this token stream will follow in a later change.

Change-Id: I2badf14a41313ca2f75dec20adbcf9031b22ab83
Reviewed-on: https://cl.tvl.fyi/c/depot/+/5338
Reviewed-by: wpcarro <wpcarro@gmail.com>
Autosubmit: wpcarro <wpcarro@gmail.com>
Tested-by: BuildkiteCI
This commit is contained in:
William Carroll 2022-02-28 11:59:55 -08:00 committed by clbot
parent 7770ccf0e3
commit 88a3051ae5
2 changed files with 88 additions and 5 deletions

View file

@ -1,8 +1,91 @@
import string
from scanner import Scanner
################################################################################
# Predicates
################################################################################
def is_alpha(c):
    """Return True when character `c` is an ASCII letter (a-z or A-Z)."""
    return c in string.ascii_lowercase or c in string.ascii_uppercase
def is_digit(c):
    """Return True when character `c` is an ASCII digit (0-9).

    Uses `string.digits` for consistency with `is_alpha`, which draws its
    character class from the `string` module rather than a hardcoded literal.
    """
    return c in string.digits
def is_alphanumeric(c):
    """Return True when character `c` is an ASCII letter or digit."""
    return any(pred(c) for pred in (is_alpha, is_digit))
def is_whitespace(c):
    """Return True when character `c` is a space, CR, tab, or newline."""
    return c in (" ", "\r", "\t", "\n")
################################################################################
# Tokenizer
################################################################################
def tokenize(x):
    """Tokenize the query string `x` and return the list of tokens.

    BUG FIX: the original had a stray `return None` immediately after
    constructing the Scanner, which made the actual tokenization dead code
    and caused every call to return None.
    """
    s = Scanner(x)
    return scan_tokens(s)
def scan_tokens(s):
    """Consume scanner `s` and return the list of tokens, skipping whitespace."""
    tokens = []
    while not s.exhausted():
        if is_whitespace(s.peek()):
            # Whitespace separates tokens but produces none of its own.
            s.advance()
            continue
        tokens.append(scan_token(s))
    return tokens
def scan_token(s):
    """Scan and return the next token from scanner `s`.

    Tokens are either bare punctuation labels ("NOT", "COLON") or
    (type, lexeme) tuples produced by the specialized tokenizers.

    Raises:
        Exception: on a character that begins no known token. The original
        silently returned None here WITHOUT advancing the scanner, which made
        scan_tokens spin forever on any unexpected input character.
    """
    punctuation = {
        "-": "NOT",
        ":": "COLON",
    }
    c = s.peek()
    if c in punctuation:
        s.advance()
        return punctuation[c]
    if c == "\"":
        return tokenize_string(s)
    if c == "/":
        return tokenize_regex(s)
    if is_alpha(c):
        return tokenize_identifier(s)
    raise Exception("Unexpected character: \"{}\"".format(c))
def tokenize_string(s):
    """Scan a double-quoted string literal and return a ("STRING", text) token.

    Raises an exception when the input ends before the closing quote.
    """
    s.advance()  # drop the opening double-quote
    chars = []
    while s.peek() != "\"" and not s.exhausted():
        chars.append(s.advance())
    if s.exhausted():
        raise Exception("Unterminated string")
    s.advance()  # drop the closing double-quote
    return ("STRING", "".join(chars))
def tokenize_regex(s):
    """Scan a /slash-delimited/ regex literal and return a ("REGEX", text) token.

    Raises an exception when the input ends before the closing slash.
    """
    s.advance()  # drop the opening forward-slash
    chars = []
    while s.peek() != "/" and not s.exhausted():
        chars.append(s.advance())
    if s.exhausted():
        raise Exception("Unterminated regex")
    s.advance()  # drop the closing forward-slash
    return ("REGEX", "".join(chars))
def tokenize_identifier(s):
    """Scan an alphanumeric word from `s`.

    Words matching AND/OR (case-insensitively) become ("KEYWORD", WORD)
    tokens; everything else becomes ("IDENTIFIER", word) with original case.
    """
    keywords = {"AND", "OR"}
    chars = [s.advance()]
    while is_alphanumeric(s.peek()):
        chars.append(s.advance())
    word = "".join(chars)
    upper = word.upper()
    if upper in keywords:
        return ("KEYWORD", upper)
    return ("IDENTIFIER", word)
################################################################################
# Main
################################################################################
def main():
while True:

View file

@ -2,15 +2,15 @@
# scanner/lexer needs are peek and advance; other functions (e.g. match) are
# nice-to-haves.
class Scanner(object):
def __init__(self, source):
def __init__(self, chars):
self.i = 0
self.source = source
self.chars = chars
def exhausted(self):
return self.i >= len(self.source)
return self.i >= len(self.chars)
def peek(self, n=0):
return self.source[self.i + n] if self.i + n < len(self.source) else '\0'
return self.chars[self.i + n] if self.i in range(0, len(self.chars)) else '\0'
def advance(self):
result = self.peek()