tvl-depot/users/wpcarro/scratch/facebook/parsing/json.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

122 lines
3.1 KiB
Python
Raw Normal View History

from parser import Parser
# As an exercise to stress-test my understanding of recursive descent parsers,
# I'm attempting to write a JSON parser without referencing any existing BNF
# descriptions of JSON or existing JSON parser implementations.
#
# I'm only parsing a subset of JSON: enough to parse `sample`. Here is the BNF
# that I wrote to describe my expected input:
#
#   expression -> object
#   object     -> '{' ( STRING ':' expression ) ( ',' STRING ':' expression )* '}'
#              |  array
#   array      -> '[' expression ( ',' expression )* ']'
#              |  literal
#   literal    -> STRING | INT
def tokenize(xs):
    """
    Return a list of tokens from the string input, `xs`.

    Token kinds:
    - punctuation: each of `,` `{` `}` `:` `[` `]` becomes a one-character
      string;
    - strings: kept verbatim, including both surrounding double quotes. Escape
      sequences are not interpreted, so the first `"` after the opening quote
      always terminates the string;
    - integers: a run of ASCII digits, converted with `int`.

    Whitespace between tokens is skipped.

    Raises:
        ValueError: if a string is missing its closing quote, or if a
            character that cannot start any token is encountered.
    """
    punctuation = ",{}:[]"
    digits = "0123456789"
    result = []
    i = 0
    n = len(xs)
    while i < n:
        c = xs[i]
        if c in punctuation:
            result.append(c)
            i += 1
        elif c == "\"":
            # Locate the closing quote up front so an unterminated string is
            # reported instead of indexing past the end of the input.
            end = xs.find("\"", i + 1)
            if end == -1:
                raise ValueError(
                    "Unterminated string starting at index {}".format(i))
            result.append(xs[i:end + 1])
            i = end + 1
        elif c in digits:
            start = i
            # The bounds check matters when the integer is the last thing in
            # the input.
            while i < n and xs[i] in digits:
                i += 1
            result.append(int(xs[start:i]))
        elif c.isspace():
            i += 1
        else:
            # Without this branch `i` would never advance and the loop would
            # spin forever on unsupported input.
            raise ValueError(
                "Unexpected character {!r} at index {}".format(c, i))
    return result
def parse_json(x):
    """
    Tokenize the string `x` and parse the resulting tokens.

    The token list from `tokenize` is wrapped in a `Parser` and handed to
    `parse_object`; whatever `parse_object` returns is returned unchanged.
    """
    return parse_object(Parser(tokenize(x)))
def parse_object(parser):
    """
    Parse an object, or defer to `parse_array` when the next token is not `{`.

    Grammar handled here:

        object -> '{' ( STRING ':' expression
                        ( ',' STRING ':' expression )* )? '}'
                | array

    Returns:
        For an object, a list of `(key, value)` tuples in source order (an
        empty list for `{}`). Keys are the STRING tokens exactly as returned
        by `parse_string`. Otherwise, whatever `parse_array` returns.

    Raises:
        Whatever `parser.expect` raises when a `:` or the closing `}` is
        missing, and whatever `parse_string` raises for a non-string key.

    NOTE(review): this relies on `parser.match(tokens)` consuming the current
    token and returning a truthy value when it is one of `tokens`, and on
    `parser.expect(tokens)` failing loudly otherwise -- confirm against the
    `Parser` class in the `parser` module.
    """
    if not parser.match(['{']):
        return parse_array(parser)

    result = []
    if parser.match(['}']):
        return result

    while True:
        key = parse_string(parser)
        # Every pair requires its colon, not only the first one.
        parser.expect([':'])
        value = parse_object(parser)
        result.append((key, value))
        if not parser.match([',']):
            break

    # The closing brace must be consumed here. If it is left in the stream,
    # the enclosing object's ',' loop sees '}' and stops early, silently
    # dropping every pair that follows a nested object.
    parser.expect(['}'])
    return result
def parse_array(parser):
    """
    Parse an array, or defer to `parse_literal` when the next token is not `[`.

    An array is `[` followed by zero or more comma-separated expressions and a
    closing `]`. Each element is parsed with `parse_object`, and the elements
    are returned as a list in source order.
    """
    if not parser.match(['[']):
        return parse_literal(parser)

    elements = []
    # Empty array: nothing between the brackets.
    if parser.match([']']):
        return elements

    # At least one element, then one more after every comma.
    while True:
        elements.append(parse_object(parser))
        if not parser.match([',']):
            break

    parser.expect([']'])
    return elements
def parse_string(parser):
    """
    Consume and return the current token if it is a STRING token.

    A STRING token is a `str` beginning with a double quote, which is how
    `tokenize` emits strings (quotes included).

    Raises:
        Exception: "Unexpected token: <token>" when the current token is not a
            STRING. This includes integer tokens, which have no `startswith`
            method and would otherwise fail with an AttributeError.
    """
    token = parser.curr()
    if isinstance(token, str) and token.startswith("\""):
        return parser.consume()
    raise Exception("Unexpected token: {}".format(token))
def parse_literal(parser):
    """
    Consume one token from `parser` and return it as the literal's value.

    No validation is performed here: whatever `parser.consume()` yields is
    returned unchanged.
    """
    token = parser.consume()
    return token
# Sample document the parser is written against: nested objects, one array,
# and string values.
sample = """
{
"glossary": {
"title": "example glossary",
"GlossDiv": {
"title": "S",
"GlossList": {
"GlossEntry": {
"ID": "SGML",
"SortAs": "SGML",
"GlossTerm": "Standard Generalized Markup Language",
"Acronym": "SGML",
"Abbrev": "ISO 8879:1986",
"GlossDef": {
"para": "A meta-markup language, used to create markup languages such as DocBook.",
"GlossSeeAlso": [
"GML",
"XML"
]
},
"GlossSee": "markup"
}
}
}
}
}
"""
# Executed at module top level: parse `sample` and print the parsed result.
print(parse_json(sample))