Write encoded XML parser and pretty-printer
Write a function that reads a string of compressed XML and outputs the decompressed version. Note to self: Now that I'm growing more comfortable writing parsers, I'd like to become equally comfortable writing pretty-printers.
This commit is contained in:
parent
bfd2180e6b
commit
c841527f61
2 changed files with 135 additions and 0 deletions
98
scratch/facebook/moderate/decompress-xml.py
Normal file
98
scratch/facebook/moderate/decompress-xml.py
Normal file
|
@ -0,0 +1,98 @@
|
|||
import string
|
||||
from parser import Parser
|
||||
|
||||
# Tag-id -> tag-name table used to decode the integer tags of the
# compressed XML. Ids are assigned in order, starting at 1.
mapping = dict(enumerate(
    ("family", "person", "firstName", "lastName", "state"),
    1,
))
|
||||
|
||||
def parse_int(i, xs):
    """Read a run of decimal digits from ``xs`` starting at index ``i``.

    Returns a ``(next_index, value)`` pair, where ``next_index`` is the
    position of the first character after the digits and ``value`` is the
    digits converted with ``int``.  If there is no digit at ``i`` the
    conversion of the empty string raises ``ValueError``.
    """
    digits = []
    limit = len(xs)
    while i < limit:
        ch = xs[i]
        if ch not in string.digits:
            break
        digits.append(ch)
        i += 1
    return i, int("".join(digits))
|
||||
|
||||
def parse_string(i, xs):
    """Read a text token from ``xs`` starting at index ``i``.

    A text token runs until the next digit or the end of the input, so it
    may contain spaces (e.g. ``"Some Message"``).  Whitespace that separates
    the text from the following token is stripped from the result.

    Returns a ``(next_index, text)`` pair, where ``next_index`` is the index
    of the terminating digit, or ``len(xs)`` when the text runs to the end
    of the input.  The index always moves forward when ``xs[i]`` is not a
    digit, so a caller looping on the result cannot stall.
    """
    start = i
    end = len(xs)
    # Bounds-check before indexing so text at the very end of the input is
    # read in full instead of raising IndexError.
    while i < end and xs[i] not in string.digits:
        i += 1
    return i, xs[start:i].rstrip()
|
||||
|
||||
def tokenize(xs):
    """Split the encoded XML string ``xs`` into a list of tokens.

    Runs of digits become ``int`` tokens (via ``parse_int``) and text
    starting with an ASCII letter becomes ``str`` tokens (via
    ``parse_string``).  Whitespace between tokens is skipped.

    Raises:
        ValueError: if a token starts with a character that is neither a
            digit, an ASCII letter, nor whitespace.
    """
    result = []
    i = 0
    while i < len(xs):
        ch = xs[i]
        if ch in string.digits:
            i, n = parse_int(i, xs)
            result.append(n)
        elif ch in string.ascii_letters:
            i, x = parse_string(i, xs)
            result.append(x)
        elif ch.isspace():
            i += 1
        else:
            # Every branch must either advance ``i`` or raise; silently
            # falling through here would loop forever on this character.
            raise ValueError(
                "Unexpected character {!r} at index {}".format(ch, i))
    return result
|
||||
|
||||
def parse(xs):
    """Tokenize the encoded XML string ``xs`` and parse it into a tree.

    The result is whatever ``parse_element`` returns for the token stream.
    """
    tokens = tokenize(xs)
    return parse_element(Parser(tokens))
|
||||
|
||||
# Element   -> Tag Attribute* End Element* End | STRING ;
# Tag       -> INTEGER ;
# Value     -> STRING ;
# Attribute -> Tag Value ;
# End       -> 0 ;
|
||||
|
||||
def parse_element(parser):
    """Parse one element (or text node) at the parser's current position.

    A string token is a text node: it is consumed and returned as-is.
    Otherwise the element is read as an integer tag id (looked up in
    ``mapping``), its attributes, a ``0`` terminator, zero or more child
    elements, and a closing ``0``.

    Returns either the text string or a ``[tag, attrs, children]`` list.
    """
    def is_int(token):
        return type(token) is int

    # Text node: nothing further to parse.
    if type(parser.curr()) is str:
        return parser.consume()

    name = mapping[parser.expect_predicate(is_int)]
    attributes = parse_attrs(parser)
    parser.expect([0])  # end of the attribute list

    kids = []
    while True:
        if parser.exhausted() or parser.curr() == 0:
            break
        kids.append(parse_element(parser))
    parser.expect([0])  # end of the element

    return [name, attributes, kids]
|
||||
|
||||
def parse_attrs(parser):
    """Parse ``tag value`` attribute pairs until the current token is ``0``.

    Each attribute is an integer tag id (looked up in ``mapping``) followed
    by a value token.  The terminating ``0`` is left for the caller.

    Returns a list of ``(name, value)`` tuples in input order.
    """
    pairs = []
    while True:
        if parser.curr() == 0:
            break
        name = mapping[parser.expect_predicate(lambda tok: type(tok) is int)]
        pairs.append((name, parser.consume()))
    return pairs
|
||||
|
||||
def stringify_xml(tree, indent=0):
    """Pretty-print a parsed tree as indented XML text.

    Args:
        tree: either a text string, or a ``[tag, attrs, children]`` list
            where ``attrs`` is a sequence of ``(name, value)`` pairs and
            ``children`` is a sequence of nested trees.
        indent: nesting depth of ``tree``; each level adds two spaces.

    Returns:
        A text ``tree`` is returned unchanged.  An element is rendered as an
        opening tag, one line per child at ``indent + 1``, and a closing
        tag, joined with newlines.  An element without children renders as
        just its opening and closing tag lines.
    """
    if isinstance(tree, str):
        return tree

    tag, attrs, children = tree
    pad = "  " * indent
    child_pad = "  " * (indent + 1)

    # Each attribute carries its own leading space, so an element without
    # attributes renders as a bare "<tag>".
    str_attrs = "".join(' {}="{}"'.format(k, v) for k, v in attrs)

    lines = ["{}<{}{}>".format(pad, tag, str_attrs)]
    for child in children:
        if isinstance(child, str):
            # Text nodes carry no indentation of their own.
            lines.append(child_pad + child)
        else:
            # Nested elements indent every one of their lines themselves;
            # adding padding here would shift only their first line.
            lines.append(stringify_xml(child, indent + 1))
    lines.append("{}</{}>".format(pad, tag))
    return "\n".join(lines)
|
||||
|
||||
# Sample compressed document, run through every stage of the pipeline.
x = "1 4 McDowell 5 CA 0 2 3 Gayle 0 Some Message 0 0"


def _show(template, value):
    """Print ``value`` substituted into ``template``."""
    print(template.format(value))


_show("Input: {}", x)
_show("Tokens: {}", tokenize(x))
_show("Parsed: {}", parse(x))
_show("{}", stringify_xml(parse(x)))
|
37
scratch/facebook/moderate/parser.py
Normal file
37
scratch/facebook/moderate/parser.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
class Parser(object):
    """Cursor over a list of tokens with small match/expect helpers.

    ``tokens`` is the sequence being parsed and ``i`` is the index of the
    current (not yet consumed) token.
    """

    # Shown in error messages in place of a token when the cursor is past
    # the last token.
    _END_OF_INPUT = "<end of input>"

    def __init__(self, tokens):
        self.tokens = tokens
        self.i = 0

    def prev(self):
        """Return the token just before the cursor.

        At position 0 this indexes ``tokens[-1]``, i.e. the last token.
        """
        return self.tokens[self.i - 1]

    def curr(self):
        """Return the token at the cursor (IndexError if exhausted)."""
        return self.tokens[self.i]

    def next(self):
        """Return the token after the cursor (IndexError if none)."""
        return self.tokens[self.i + 1]

    def consume(self):
        """Advance past the current token and return it.

        When the parser is already exhausted the cursor does not move and
        the previous token is returned again.
        """
        if not self.exhausted():
            self.i += 1
        return self.prev()

    def match(self, xs):
        """Consume the current token if it is in ``xs``.

        Returns True when a token was consumed, False otherwise (including
        when the parser is exhausted).
        """
        if not self.exhausted() and self.curr() in xs:
            self.consume()
            return True
        return False

    def expect(self, xs):
        """Consume and return the current token, which must be in ``xs``.

        Raises:
            Exception: if the current token is not in ``xs`` or the input
                is exhausted.
        """
        if not self.match(xs):
            raise Exception("Expected token \"{}\" but received \"{}\"".format(xs, self._describe_curr()))
        return self.prev()

    def expect_predicate(self, predicate):
        """Consume and return the current token if ``predicate`` accepts it.

        Raises:
            Exception: if ``predicate`` rejects the current token or the
                input is exhausted.
        """
        if not self.exhausted() and predicate(self.curr()):
            return self.consume()
        raise Exception("Expected token \"{}\" to pass predicate, but it did not".format(self._describe_curr()))

    def exhausted(self):
        """Return True when every token has been consumed."""
        return self.i >= len(self.tokens)

    def _describe_curr(self):
        """Return the current token, or a placeholder at end of input.

        Used when building error messages so that reporting a failure at
        the end of the token list does not itself raise IndexError.
        """
        if self.exhausted():
            return self._END_OF_INPUT
        return self.curr()
|
Loading…
Reference in a new issue