diff --git a/docs/heckformat.md b/docs/heckformat.md index ecb6b3d..60b6759 100644 --- a/docs/heckformat.md +++ b/docs/heckformat.md @@ -4,7 +4,7 @@ %%% heck element value value element value value -element tag=value +element attribute=value > subelement value > subelement value >> sub-subelement value @@ -23,19 +23,23 @@ NUMBER ::= () STRING ::= "([^\"]*|(\\)|(\"))" VALUE ::= (||) VALUES ::= (\s+)? -TAGNAME ::= -TAG ::= = -TAGS ::= (\s+)? +ATTRIBUTENAME ::= +ATTRIBUTE ::= = +ATTRIBUTES ::= (\s+)? SECTIONLABEL ::= -SECTION ::= %%%\s+\s+ +SECTION ::= %%%\s+\s+ ELEMENTLABEL ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]? -ELEMENT ::= \s+(|) +ELEMENT ::= \s+(|) LINE ::= ^(((>)*) |
| ) (|$) ``` -Heck is composed of a series of elements each with a label and one or more values. Elements may also have several key-value pairs associated with them as tags. Elements may also have sub-elements which then take the place of the value indicated with a > before the value at the current level. The heck document also can have several sections. Sections all start with %%% and a label. If the label is heck, the section is interpreted as more elements for the heck document. Any other label is stored as an element of arbitrary string value under that label name. +Heck is composed of a series of elements each with a label and one or more values. Elements may also have several key-value pairs associated with them as attributes. Elements may also have sub-elements which then take the place of the value indicated with a > before the value at the current level. The heck document can also have several sections. Sections all start with %%% and a label. If the label is heck, the section is interpreted as more elements for the heck document. Any other label is stored as an element of arbitrary string value under that label name. The data structure represented is an ordered array. Elements may have the same name as previous elements in the same containment. + +## APIs + +Heckformat APIs should provide a way to iterate the elements, to access their attributes as a mapping, to access their values as an array, and to collect all elements within the root (or another element) of a specific kind (or set of kinds) as an array. 
diff --git a/python/__init__.py b/python/heck/__init__.py similarity index 100% rename from python/__init__.py rename to python/heck/__init__.py diff --git a/python/heck/lexer.py b/python/heck/lexer.py new file mode 100644 index 0000000..1af81ba --- /dev/null +++ b/python/heck/lexer.py @@ -0,0 +1,97 @@ +import ply.lex as lex + +from typing import List, Optional + +import string + +tokens = ('ATOM', 'BASE10', 'BASE16', 'COMMENT', 'STRING', 'SECTION', 'ATTRIB', 'DEEP') + +# COMMENT ::= # .*$ +# ATOM ::= [A-Za-z_][A-Za-z0-9_-]? +# BASE10NUMBER ::= (-)?[0-9]+(\.)?[0-9]+([FLUIDCfluidc])? +# BASE16NUMBER ::= 0x[0-9A-Fa-f]+ +# NUMBER ::= () +# STRING ::= "([^\"]*|(\\)|(\"))" +# VALUE ::= (||) +# VALUES ::= (\s+)? +# ATTRIBUTENAME ::= +# ATTRIBUTE ::= = +# ATTRIBUTES ::= (\s+)? +# SECTIONLABEL ::= +# SECTION ::= %%%\s+\s+ +# ELEMENTLABEL ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]? +# ELEMENT ::= \s+(|) +# LINE ::= ^(((>)*) |
| ) (|$)
+
+
+# Token patterns given as plain strings.  PLY looks rules up by their
+# module-level ``t_<NAME>`` identifier, so these names must not be renamed.
+t_ignore = string.whitespace
+t_DEEP = r'^(>)+'
+t_ATOM = r'[A-Za-z_$][A-Za-z0-9_.-]*'
+t_BASE16 = r'0x[0-9A-Fa-f]+'
+t_SECTION = r'^%%%\s'
+t_ATTRIB = '='
+
+# For the function rules below the docstring IS the token's regular
+# expression, so descriptions are written as comments, not docstrings.
+
+# BASE10: replace the matched text with a Python int or float.
+def t_BASE10(token: lex.LexToken):
+    r'(-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?(\b|$)'
+    # python numbers are Very Flexible so we ignore typespec
+    text = token.value
+    digits = text[:-1] if text[-1] in 'FLUIDCfluidc' else text
+    token.value = float(digits) if '.' in digits else int(digits)
+    return token
+
+# COMMENT: matched, but nothing is returned, so no token is handed back.
+def t_COMMENT(token: lex.LexToken):
+    r'\#\s?.*$'
+    return None
+
+# STRING: the token value becomes the text between the double quotes.
+def t_STRING(token: lex.LexToken):
+    r'"[^"]*"'
+    quoted = token.value
+    token.value = quoted[1:-1]  # drop the surrounding double quotes
+    return token
+
+# Report a character no rule matched.  The skip() call stays disabled, so
+# this handler does not advance the lexer past the offending character.
+def t_error(token: lex.LexToken):
+    offender = token.value[0]
+    print(f"{token.lineno} Unexpected character '{offender}' at position {token.lexpos}.")
+    print('... ' + token.value)
+    print(' ^')
+    # token.lexer.skip(1)
+
+lexer = lex.lex()
+
+def lex_line(line, lineno=0) -> Optional[List[lex.LexToken]]:
+    """Tokenize one line of heck text and return its tokens as a list.
+
+    ``lineno`` is stored on the shared module-level ``lexer`` before the
+    line is scanned.  A ``lex.LexError`` raised while scanning is re-raised
+    unchanged.
+    """
+    lexer.lineno = lineno
+    try:
+        lexer.input(line)
+        # lexer.token() returns None once the input is exhausted; use that
+        # as the sentinel that ends the iteration.
+        return list(iter(lexer.token, None))
+    except lex.LexError:
+        # fixme raise a HeckFormat exception
+        raise
+
+TEST_STRINGS = [
+    '"hi yo123 123xyz #foo" 123xyz 123.223 1f abcd123 123abc $foo "hello world" #foo',
+    '1.23f',
+    '"hello world!" 
atom utehuteu tnhoeun_etuhenuoh',
+    '"hi yo123 123xyz #foo" xyz 123.223 1f abcd123 abc $foo "hello world" #foo',
+    '%%% heck',
+    '%%% markdown foo=bar',
+    'element 1.2 1.3 1.4 attrib="string value for attribute"',
+    '> element 5 4 3 2.5',
+]
+
+if __name__ == "__main__":
+    for idx, test in enumerate(TEST_STRINGS):
+        print(f"Line {idx}: '{test}'")
+        try:
+            for token in lex_line(test, idx):
+                print(' ' + str(token))
+        except Exception as inst:
+            print(f'Error in line.')
diff --git a/python/heck/parse.py b/python/heck/parse.py
new file mode 100644
index 0000000..7437701
--- /dev/null
+++ b/python/heck/parse.py
@@ -0,0 +1,127 @@
+from typing import Iterable, Union, Mapping, TypeVar, List
+
+import re
+
+from parser import parser
+
+class HeckException (Exception):
+    """Base class for the exceptions defined in this module."""
+
+class HeckParseException(HeckException):
+    """Raised when parsed input does not have the expected structure."""
+
+
+# A value stored on an element.  "HeckElement" is a forward reference to the
+# class defined below (a TypeVar is not a forward reference).
+HeckValue = Union["HeckElement", str, int, float]
+
+class HeckElement:
+    """One node of a heck document tree.
+
+    A new element starts out empty: no name, no values, no attributes, no
+    children, and ``unparsed`` set to False.
+    """
+
+    # label of the element
+    name: str
+    # nested elements, in insertion order
+    children: List["HeckElement"]
+    # values attached to the element, in insertion order
+    values: List[HeckValue]
+    # attribute name -> attribute value
+    attributes: Mapping[str, HeckValue]
+    # when True, __str__ prefixes the element with "Unparsed "
+    unparsed: bool
+
+
+    def __init__(self):
+        self.children = []
+        self.values = []
+        self.attributes = dict()
+        self.name = ""
+        self.unparsed = False
+
+    def __str__(self):
+        """Return a one-line description of the element and its contents."""
+        prefix = 'Unparsed ' if self.unparsed else ''
+        return (
+            f"<{prefix}HeckElement {self.name!r} values={self.values!r} "
+            f"attributes={self.attributes!r} children={self.children!r}>"
+        )
+
+    def __repr__(self):
+        return self.__str__()
+
+# COMMENT ::= # .*$
+# ATOM ::= [A-Za-z_][A-Za-z0-9_-]?
+# BASE10NUMBER ::= (-)?[0-9]+(\.)?[0-9]+([FLUIDCfluidc])?
+# BASE16NUMBER ::= 0x[0-9A-Fa-f]+
+# NUMBER ::= ()
+# STRING ::= "([^\"]*|(\\)|(\"))"
+# VALUE ::= (||)
+# VALUES ::= (\s+)?
+# ATTRIBUTENAME ::=
+# ATTRIBUTE ::= =
+# ATTRIBUTES ::= (\s+)?
+# SECTIONLABEL ::=
+# SECTION ::= %%%\s+\s+
+# ELEMENTLABEL ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]?
+# ELEMENT ::= \s+(|)
+# LINE ::= ^(((>)*) |
| ) (|$)
+
+
+# ATOM = re.compile(r'[A-Za-z_][A-Za-z0-9_-]*')
+
+
+def get_element(ast: List) -> HeckElement:
+    """Build a HeckElement from an ``['element', name, ...]`` parse result.
+
+    Items after the name that are tagged ``'values'`` or ``'attributes'``
+    fill the element's values and attributes; any other item is ignored.
+
+    Raises:
+        HeckParseException: if ``ast`` is not tagged ``'element'``.
+    """
+    if ast[0] != 'element':
+        raise HeckParseException("Found a non-element where an element was expected.")
+    elm = HeckElement()
+    elm.name = ast[1]
+    for item in ast[2:]:
+        kind, entries = item[0], item[1:]
+        if kind == 'values':
+            # every entry is indexed as (tag, value)
+            elm.values = [entry[1] for entry in entries]
+        elif kind == 'attributes':
+            # every entry is indexed as (tag, name, (tag, value))
+            elm.attributes.update({entry[1]: entry[2][1] for entry in entries})
+    return elm
+
+def load_heck(inp: Iterable[str]) -> HeckElement:
+    """Load a heck document from an iterable of lines.
+
+    Returns a root element named ``__ROOT__``.  Elements found in ``%%% heck``
+    sections become children of the root.  Every other section becomes a
+    child element marked ``unparsed`` whose values are the section's raw
+    lines.  Lines for which the parser returns nothing are skipped.
+
+    NOTE: a statement tagged ``'deep'`` is handed to ``get_element`` as is,
+    which rejects it with a HeckParseException.
+
+    Raises:
+        HeckParseException: if an element line appears outside a
+            ``%%% heck`` section.
+    """
+    MODE_INIT = 0     # not inside any section
+    MODE_ELM = 1      # inside a "%%% heck" section
+    MODE_UNPARSE = 2  # inside any other section; lines are kept verbatim
+
+    rootelm = HeckElement()
+    rootelm.name = "__ROOT__"
+    pelm = rootelm  # parent for subelement
+    mode = MODE_INIT
+    for lineno, line in enumerate(inp, start=1):
+        if mode == MODE_UNPARSE:
+            if not line.startswith('%%%'):
+                pelm.values.append(line)
+                continue
+            # A new section header ends the raw section.  Fall through so the
+            # header itself is parsed instead of being silently dropped.
+            mode = MODE_INIT
+        ast = parser.parse(line)
+        if not ast:
+            continue
+        if ast[0] == 'section':
+            if ast[1] == 'heck':
+                mode = MODE_ELM
+                pelm = rootelm
+            else:
+                mode = MODE_UNPARSE
+                pelm = HeckElement()
+                rootelm.children.append(pelm)
+                pelm.name = ast[1]
+                pelm.unparsed = True
+        elif mode != MODE_ELM:
+            raise HeckParseException(f"Didn't find heck preamble, line {lineno}")
+        else:
+            pelm.children.append(get_element(ast))
+
+    return rootelm
+
+
+TEST_HECK = """
+%%% heck
+# Website!
+title "My Website" bold=True
+subtitle "Yep it's a website"
+scale 3.72
+matrix 0 0 0 0 1 2 3 1 2 3 4 29394.2
+tags hey man what are you doin
+
+%%% markdown
+# Some cheeky markdown to confuse our processing.
+
+All my page content goes here. 
+""" + +if __name__ == "__main__": + result = load_heck(TEST_HECK.split('\n')) + print(result) diff --git a/python/heck/parser.out b/python/heck/parser.out new file mode 100644 index 0000000..58f4673 --- /dev/null +++ b/python/heck/parser.out @@ -0,0 +1,333 @@ +Created by PLY version 3.11 (http://www.dabeaz.com/ply) + +Unused terminals: + + COMMENT + +Grammar + +Rule 0 S' -> statement +Rule 1 value -> BASE16 +Rule 2 value -> BASE10 +Rule 3 value -> STRING +Rule 4 value -> ATOM +Rule 5 attribute -> ATOM ATTRIB value +Rule 6 attributes -> attributes attribute +Rule 7 attributes -> attribute +Rule 8 section -> SECTION ATOM +Rule 9 section -> SECTION ATOM attributes +Rule 10 values -> values value +Rule 11 values -> value +Rule 12 element -> ATOM values +Rule 13 element -> ATOM values attributes +Rule 14 element -> ATOM attributes +Rule 15 statement -> element +Rule 16 statement -> DEEP element +Rule 17 statement -> section + +Terminals, with rules where they appear + +ATOM : 4 5 8 9 12 13 14 +ATTRIB : 5 +BASE10 : 2 +BASE16 : 1 +COMMENT : +DEEP : 16 +SECTION : 8 9 +STRING : 3 +error : + +Nonterminals, with rules where they appear + +attribute : 6 7 +attributes : 6 9 13 14 +element : 15 16 +section : 17 +statement : 0 +value : 5 10 11 +values : 10 12 13 + +Parsing method: LALR + +state 0 + + (0) S' -> . statement + (15) statement -> . element + (16) statement -> . DEEP element + (17) statement -> . section + (12) element -> . ATOM values + (13) element -> . ATOM values attributes + (14) element -> . ATOM attributes + (8) section -> . SECTION ATOM + (9) section -> . SECTION ATOM attributes + + DEEP shift and go to state 3 + ATOM shift and go to state 5 + SECTION shift and go to state 6 + + statement shift and go to state 1 + element shift and go to state 2 + section shift and go to state 4 + +state 1 + + (0) S' -> statement . + + + +state 2 + + (15) statement -> element . + + $end reduce using rule 15 (statement -> element .) + + +state 3 + + (16) statement -> DEEP . 
element + (12) element -> . ATOM values + (13) element -> . ATOM values attributes + (14) element -> . ATOM attributes + + ATOM shift and go to state 5 + + element shift and go to state 7 + +state 4 + + (17) statement -> section . + + $end reduce using rule 17 (statement -> section .) + + +state 5 + + (12) element -> ATOM . values + (13) element -> ATOM . values attributes + (14) element -> ATOM . attributes + (10) values -> . values value + (11) values -> . value + (6) attributes -> . attributes attribute + (7) attributes -> . attribute + (1) value -> . BASE16 + (2) value -> . BASE10 + (3) value -> . STRING + (4) value -> . ATOM + (5) attribute -> . ATOM ATTRIB value + + BASE16 shift and go to state 13 + BASE10 shift and go to state 14 + STRING shift and go to state 15 + ATOM shift and go to state 8 + + values shift and go to state 9 + attributes shift and go to state 10 + value shift and go to state 11 + attribute shift and go to state 12 + +state 6 + + (8) section -> SECTION . ATOM + (9) section -> SECTION . ATOM attributes + + ATOM shift and go to state 16 + + +state 7 + + (16) statement -> DEEP element . + + $end reduce using rule 16 (statement -> DEEP element .) + + +state 8 + + (4) value -> ATOM . + (5) attribute -> ATOM . ATTRIB value + + BASE16 reduce using rule 4 (value -> ATOM .) + BASE10 reduce using rule 4 (value -> ATOM .) + STRING reduce using rule 4 (value -> ATOM .) + ATOM reduce using rule 4 (value -> ATOM .) + $end reduce using rule 4 (value -> ATOM .) + ATTRIB shift and go to state 17 + + +state 9 + + (12) element -> ATOM values . + (13) element -> ATOM values . attributes + (10) values -> values . value + (6) attributes -> . attributes attribute + (7) attributes -> . attribute + (1) value -> . BASE16 + (2) value -> . BASE10 + (3) value -> . STRING + (4) value -> . ATOM + (5) attribute -> . ATOM ATTRIB value + + $end reduce using rule 12 (element -> ATOM values .) 
+ BASE16 shift and go to state 13 + BASE10 shift and go to state 14 + STRING shift and go to state 15 + ATOM shift and go to state 8 + + attributes shift and go to state 18 + value shift and go to state 19 + attribute shift and go to state 12 + +state 10 + + (14) element -> ATOM attributes . + (6) attributes -> attributes . attribute + (5) attribute -> . ATOM ATTRIB value + + $end reduce using rule 14 (element -> ATOM attributes .) + ATOM shift and go to state 20 + + attribute shift and go to state 21 + +state 11 + + (11) values -> value . + + BASE16 reduce using rule 11 (values -> value .) + BASE10 reduce using rule 11 (values -> value .) + STRING reduce using rule 11 (values -> value .) + ATOM reduce using rule 11 (values -> value .) + $end reduce using rule 11 (values -> value .) + + +state 12 + + (7) attributes -> attribute . + + ATOM reduce using rule 7 (attributes -> attribute .) + $end reduce using rule 7 (attributes -> attribute .) + + +state 13 + + (1) value -> BASE16 . + + BASE16 reduce using rule 1 (value -> BASE16 .) + BASE10 reduce using rule 1 (value -> BASE16 .) + STRING reduce using rule 1 (value -> BASE16 .) + ATOM reduce using rule 1 (value -> BASE16 .) + $end reduce using rule 1 (value -> BASE16 .) + + +state 14 + + (2) value -> BASE10 . + + BASE16 reduce using rule 2 (value -> BASE10 .) + BASE10 reduce using rule 2 (value -> BASE10 .) + STRING reduce using rule 2 (value -> BASE10 .) + ATOM reduce using rule 2 (value -> BASE10 .) + $end reduce using rule 2 (value -> BASE10 .) + + +state 15 + + (3) value -> STRING . + + BASE16 reduce using rule 3 (value -> STRING .) + BASE10 reduce using rule 3 (value -> STRING .) + STRING reduce using rule 3 (value -> STRING .) + ATOM reduce using rule 3 (value -> STRING .) + $end reduce using rule 3 (value -> STRING .) + + +state 16 + + (8) section -> SECTION ATOM . + (9) section -> SECTION ATOM . attributes + (6) attributes -> . attributes attribute + (7) attributes -> . attribute + (5) attribute -> . 
ATOM ATTRIB value + + $end reduce using rule 8 (section -> SECTION ATOM .) + ATOM shift and go to state 20 + + attributes shift and go to state 22 + attribute shift and go to state 12 + +state 17 + + (5) attribute -> ATOM ATTRIB . value + (1) value -> . BASE16 + (2) value -> . BASE10 + (3) value -> . STRING + (4) value -> . ATOM + + BASE16 shift and go to state 13 + BASE10 shift and go to state 14 + STRING shift and go to state 15 + ATOM shift and go to state 23 + + value shift and go to state 24 + +state 18 + + (13) element -> ATOM values attributes . + (6) attributes -> attributes . attribute + (5) attribute -> . ATOM ATTRIB value + + $end reduce using rule 13 (element -> ATOM values attributes .) + ATOM shift and go to state 20 + + attribute shift and go to state 21 + +state 19 + + (10) values -> values value . + + BASE16 reduce using rule 10 (values -> values value .) + BASE10 reduce using rule 10 (values -> values value .) + STRING reduce using rule 10 (values -> values value .) + ATOM reduce using rule 10 (values -> values value .) + $end reduce using rule 10 (values -> values value .) + + +state 20 + + (5) attribute -> ATOM . ATTRIB value + + ATTRIB shift and go to state 17 + + +state 21 + + (6) attributes -> attributes attribute . + + ATOM reduce using rule 6 (attributes -> attributes attribute .) + $end reduce using rule 6 (attributes -> attributes attribute .) + + +state 22 + + (9) section -> SECTION ATOM attributes . + (6) attributes -> attributes . attribute + (5) attribute -> . ATOM ATTRIB value + + $end reduce using rule 9 (section -> SECTION ATOM attributes .) + ATOM shift and go to state 20 + + attribute shift and go to state 21 + +state 23 + + (4) value -> ATOM . + + ATOM reduce using rule 4 (value -> ATOM .) + $end reduce using rule 4 (value -> ATOM .) + + +state 24 + + (5) attribute -> ATOM ATTRIB value . + + ATOM reduce using rule 5 (attribute -> ATOM ATTRIB value .) + $end reduce using rule 5 (attribute -> ATOM ATTRIB value .) 
+
diff --git a/python/heck/parser.py b/python/heck/parser.py
new file mode 100644
index 0000000..6718350
--- /dev/null
+++ b/python/heck/parser.py
@@ -0,0 +1,102 @@
+import ply.yacc as yacc
+
+from lexer import tokens
+
+# NOTE: in every p_* function the docstring IS the grammar rule read by PLY,
+# so explanations are written as comments and the docstrings stay untouched.
+
+# Wrap a single scalar token as ("value", token_value).
+def p_value(p):
+    """
+    value : BASE16
+          | BASE10
+          | STRING
+          | ATOM
+    """
+    p[0] = ("value", p[1])
+
+
+# Build ("attribute", name, value) from NAME=VALUE; p[2] is the "=" token.
+def p_attribute(p):
+    """attribute : ATOM ATTRIB value"""
+    p[0] = ("attribute", p[1], p[3])
+
+
+# Collect attributes into one list: ["attributes", attribute, ...].
+def p_attributes(p):
+    """
+    attributes : attributes attribute
+    attributes : attribute
+    """
+    if len(p) == 2:
+        # first attribute: start a new tagged list
+        p[0] = ["attributes", p[1]]
+    else:
+        # left recursion: extend the list built so far
+        p[0] = p[1]
+        p[0].append(p[2])
+
+
+# Build ("section", label) or ("section", label, attributes).
+def p_section(p):
+    """
+    section : SECTION ATOM
+            | SECTION ATOM attributes
+    """
+    if (len(p) == 3):
+        p[0] = ("section", p[2])
+    else:
+        p[0] = ("section", p[2], p[3])
+
+# Collect values into one list: ["values", value, ...].
+def p_values(p):
+    """
+    values : values value
+    values : value
+    """
+    if len(p) == 2:
+        # first value: start a new tagged list
+        p[0] = ["values", p[1]]
+    else:
+        # left recursion: extend the list built so far
+        p[0] = p[1]
+        p[0].append(p[2])
+
+
+# Build ["element", label, <values or attributes>] and, for the three-symbol
+# form, append the trailing attributes list as a fourth item.
+def p_element(p):
+    """
+    element : ATOM values
+            | ATOM values attributes
+            | ATOM attributes
+    """
+    p[0] = ["element", p[1], p[2]]
+    if (len(p) == 4):
+        p[0].append(p[3])
+
+
+# Top-level rule.  A DEEP-prefixed element is wrapped as ('deep', element);
+# the DEEP token itself (p[1]) is not kept in the result.
+def p_statement(p):
+    """
+    statement : element
+              | DEEP element
+              | section
+    """
+    if (len(p) > 2):
+        p[0] = ('deep', p[2])
+    else:
+        p[0] = p[1]
+
+
+# Syntax-error hook.  Nothing is printed when p is falsy (per the PLY
+# documentation, p is None when the error occurs at end of input).
+def p_error(p):
+    if p:
+        print(f"Syntax error {p}")
+
+parser = yacc.yacc(start="statement")
+
+
+TEST_STRING = [
+    '%%% heck',
+    '%%% heck foo=bar',
+    '%%% heck bar=-5l quux="hello! how are you today?" 
fred=69 barney=nice', + 'title "My website!"', + 'zoom 5.73', + 'tags yo fresh', + 'dumper 1 2 3 4 5 6 7 8 9 dumpped=True', + '> big_dumper 32 23 384848', +] + +if __name__ == "__main__": + for test in TEST_STRING: + print(parser.parse(test)) diff --git a/python/heck/parsetab.py b/python/heck/parsetab.py new file mode 100644 index 0000000..174f156 --- /dev/null +++ b/python/heck/parsetab.py @@ -0,0 +1,47 @@ + +# parsetab.py +# This file is automatically generated. Do not edit. +# pylint: disable=W,C,R +_tabversion = '3.10' + +_lr_method = 'LALR' + +_lr_signature = 'statementATOM ATTRIB BASE10 BASE16 COMMENT DEEP SECTION STRING\n value : BASE16\n | BASE10\n | STRING\n | ATOM\n attribute : ATOM ATTRIB value\n attributes : attributes attribute\n attributes : attribute\n \n section : SECTION ATOM\n | SECTION ATOM attributes\n \n values : values value\n values : value\n \n element : ATOM values\n | ATOM values attributes\n | ATOM attributes\n \n statement : element\n | DEEP element\n | section\n ' + +_lr_action_items = {'DEEP':([0,],[3,]),'ATOM':([0,3,5,6,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,],[5,5,8,16,-4,8,20,-11,-7,-1,-2,-3,20,23,20,-10,-6,20,-4,-5,]),'SECTION':([0,],[6,]),'$end':([1,2,4,7,8,9,10,11,12,13,14,15,16,18,19,21,22,23,24,],[0,-15,-17,-16,-4,-12,-14,-11,-7,-1,-2,-3,-8,-13,-10,-6,-9,-4,-5,]),'BASE16':([5,8,9,11,13,14,15,17,19,],[13,-4,13,-11,-1,-2,-3,13,-10,]),'BASE10':([5,8,9,11,13,14,15,17,19,],[14,-4,14,-11,-1,-2,-3,14,-10,]),'STRING':([5,8,9,11,13,14,15,17,19,],[15,-4,15,-11,-1,-2,-3,15,-10,]),'ATTRIB':([8,20,],[17,17,]),} + +_lr_action = {} +for _k, _v in _lr_action_items.items(): + for _x,_y in zip(_v[0],_v[1]): + if not _x in _lr_action: _lr_action[_x] = {} + _lr_action[_x][_k] = _y +del _lr_action_items + +_lr_goto_items = {'statement':([0,],[1,]),'element':([0,3,],[2,7,]),'section':([0,],[4,]),'values':([5,],[9,]),'attributes':([5,9,16,],[10,18,22,]),'value':([5,9,17,],[11,19,24,]),'attribute':([5,9,10,16,18,22,],[12,12,21,12,21,21,]),} + 
+_lr_goto = {} +for _k, _v in _lr_goto_items.items(): + for _x, _y in zip(_v[0], _v[1]): + if not _x in _lr_goto: _lr_goto[_x] = {} + _lr_goto[_x][_k] = _y +del _lr_goto_items +_lr_productions = [ + ("S' -> statement","S'",1,None,None,None), + ('value -> BASE16','value',1,'p_value','parser.py',7), + ('value -> BASE10','value',1,'p_value','parser.py',8), + ('value -> STRING','value',1,'p_value','parser.py',9), + ('value -> ATOM','value',1,'p_value','parser.py',10), + ('attribute -> ATOM ATTRIB value','attribute',3,'p_attribute','parser.py',17), + ('attributes -> attributes attribute','attributes',2,'p_attributes','parser.py',24), + ('attributes -> attribute','attributes',1,'p_attributes','parser.py',25), + ('section -> SECTION ATOM','section',2,'p_section','parser.py',36), + ('section -> SECTION ATOM attributes','section',3,'p_section','parser.py',37), + ('values -> values value','values',2,'p_values','parser.py',46), + ('values -> value','values',1,'p_values','parser.py',47), + ('element -> ATOM values','element',2,'p_element','parser.py',58), + ('element -> ATOM values attributes','element',3,'p_element','parser.py',59), + ('element -> ATOM attributes','element',2,'p_element','parser.py',60), + ('statement -> element','statement',1,'p_statement','parser.py',70), + ('statement -> DEEP element','statement',2,'p_statement','parser.py',71), + ('statement -> section','statement',1,'p_statement','parser.py',72), +]