diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..7e60618
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,21 @@
+[build-system]
+requires = ["pdm-backend"]
+build-backend = "pdm.backend"
+
+
+[project]
+name = "heckformat"
+dynamic = ["version"]
+description = "A simple format for configuration and content storage."
+authors = [{name = "Cassowary", email="cassowary@aldercone.studio"}]
+dependencies = ["ply>=3.1"]
+requires-python = ">=3.8"
+readme = "README.md"
+license = {text = "LICENSE"}
+
+[tool.pdm.build]
+package-dir = "python"
+
+[tool.pdm.version]
+source = "file"
+path = "python/heckformat/__init__.py"
diff --git a/python/heckformat/__init__.py b/python/heckformat/__init__.py
new file mode 100644
index 0000000..5f284fc
--- /dev/null
+++ b/python/heckformat/__init__.py
@@ -0,0 +1,2 @@
+
+__version__ = "0.0.1"
diff --git a/python/heckformat/exceptions.py b/python/heckformat/exceptions.py
new file mode 100644
index 0000000..389e113
--- /dev/null
+++ b/python/heckformat/exceptions.py
@@ -0,0 +1,24 @@
+"""
+Exceptions for HECKfile processing.
+"""
+
+
+class HeckException(Exception):
+    """
+    Base exception for HECKfile processing.
+
+    Derives from Exception (not BaseException) so that generic
+    ``except Exception`` handlers in calling code catch it.
+    """
+
+
+class HeckParseException(HeckException):
+    """
+    Raised for parse errors specifically.
+    """
+
+
+class HeckLexException(HeckException):
+    """
+    Raised for lex errors specifically.
+    """
diff --git a/python/heckformat/lexer.py b/python/heckformat/lexer.py
new file mode 100644
index 0000000..c02d8cc
--- /dev/null
+++ b/python/heckformat/lexer.py
@@ -0,0 +1,132 @@
+"""
+Lexical analyzer for HECKformat lines using PLY Lex.
+"""
+
+import string
+from typing import List, Optional
+
+import ply.lex as lex
+
+from .exceptions import HeckLexException
+
+tokens = ('ATOM', 'BASE10', 'BASE16', 'COMMENT', 'STRING', 'SECTION', 'ATTRIB', 'DEEP', 'ELEMENT')
+
+# Informal grammar. Upper-case names are the token patterns defined in this
+# module; lower-case names are the productions defined in parser.py.
+#
+# COMMENT    ::= \#\s?.*$
+# ATOM       ::= [A-Za-z_$][A-Za-z0-9_.-]*
+# BASE10     ::= (-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?
+# BASE16     ::= 0x[0-9A-Fa-f]+
+# STRING     ::= "[^"]*"
+# ATTRIB     ::= =
+# SECTION    ::= ^%%%\s
+# DEEP       ::= ^(>)+
+# ELEMENT    ::= [A-Za-z_.][A-Za-z0-9.!@\$%^&*()_+/\\-]*
+# value      ::= BASE16 | BASE10 | STRING | ATOM
+# values     ::= value+
+# attribute  ::= ATOM ATTRIB value
+# attributes ::= attribute+
+# elm        ::= ATOM | ELEMENT
+# section    ::= SECTION elm attributes?
+# element    ::= elm values? attributes?
+# statement  ::= element | DEEP element | section
+
+
+t_ignore = string.whitespace
+
+t_DEEP = r'^(>)+'
+
+t_BASE16 = r'0x[0-9A-Fa-f]+'
+t_SECTION = r'^%%%\s'
+t_ATTRIB = '='
+t_ELEMENT = r'[A-Za-z_.][A-Za-z0-9.!@\$%^&*()_+/\\-]*'
+
+
+def t_ATOM(token: lex.LexToken):
+    r'[A-Za-z_$][A-Za-z0-9_.-]*'
+    # PLY uses the docstring above as the token's regular expression.
+    # Atoms spelling a boolean are converted to the Python bool.
+    if token.value in ('true', 'True'):
+        token.value = True
+    elif token.value in ('false', 'False'):
+        token.value = False
+    return token
+
+
+def t_BASE10(token: lex.LexToken):
+    r'(-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?(\b|$)'
+    # python numbers are Very Flexible so we ignore typespec
+    vstr = token.value
+    if vstr[-1] in 'FLUIDCfluidc':
+        vstr = vstr[:-1]
+    if '.' in vstr:
+        token.value = float(vstr)
+    else:
+        token.value = int(vstr)
+    return token
+
+
+def t_COMMENT(token: lex.LexToken):
+    r'\#\s?.*$'
+    # Returning None makes PLY discard the comment instead of emitting a token.
+    return None
+
+
+def t_STRING(token: lex.LexToken):
+    r'"[^"]*"'
+    token.value = token.value[1:-1]  # substring to strip double quotes
+    return token
+
+
+def t_error(token: lex.LexToken):
+    # Report the offending character. The lexer position is deliberately not
+    # advanced (no token.lexer.skip): PLY then raises lex.LexError, which
+    # lex_line() converts into a HeckLexException.
+    print(f"{token.lineno} Unexpected character '{token.value[0]}' at position {token.lexpos}.")
+    print('... ' + token.value)
+    print('    ^')
+
+
+lexer = lex.lex()
+
+
+def lex_line(line: str, lineno: int = 0) -> List[lex.LexToken]:
+    """
+    Return a list of tokens for a particular HECKformat file line.
+
+    ``lineno`` is stored on the shared module-level lexer so that tokens and
+    error reports carry the caller's line number.
+
+    Raises HeckLexException if the line contains a character that no token
+    pattern accepts.
+    """
+    lexer.lineno = lineno
+    try:
+        lexer.input(line)
+        # lexer.token() returns None at end of input; iter() stops on that sentinel.
+        return list(iter(lexer.token, None))
+    except lex.LexError as inst:
+        raise HeckLexException(f"Could not tokenize line {lineno}: {inst}") from inst
+
+
+TEST_STRINGS = [
+    '"hi yo123 123xyz #foo" 123xyz 123.223 1f abcd123 123abc $foo "hello world" #foo',
+    '1.23f',
+    '"hello world!" atom utehuteu tnhoeun_etuhenuoh',
+    '"hi yo123 123xyz #foo" xyz 123.223 1f abcd123 abc $foo "hello world" #foo',
+    '%%% heck',
+    '%%% markdown foo=bar',
+    'element 1.2 1.3 1.4 attrib="string value for attribute"',
+    '> element 5 4 3 2.5',
+]
+
+
+if __name__ == "__main__":
+    for idx, test in enumerate(TEST_STRINGS):
+        print(f"Line {idx}: '{test}'")
+        try:
+            for token in lex_line(test, idx):
+                print(' ' + str(token))
+        except HeckLexException as inst:
+            print(f'Error in line: {inst}')
diff --git a/python/heckformat/parse.py b/python/heckformat/parse.py
new file mode 100644
index 0000000..19fa90b
--- /dev/null
+++ b/python/heckformat/parse.py
@@ -0,0 +1,218 @@
+"""
+Load HECKformat text into a tree of HeckElement objects.
+"""
+
+from typing import Iterable, Union, Mapping, TypeVar, List, Dict, TextIO, Any
+import collections.abc
+
+
+import re
+
+from .parser import parser
+from .exceptions import HeckParseException
+
+# A quoted forward reference inside Union keeps this alias importable on
+# Python 3.8/3.9, where typing objects do not support the ``X | Y`` operator.
+HeckValue = Union["HeckElement", str, int, float]
+
+
+class HeckElement:
+    """
+    Container for a tree of HECKformat elements.
+    """
+    name: str
+    """The name of the element, either __ROOT__ for top level or whatever is specified in file."""
+    children: List["HeckElement"]
+    """The children of the element."""
+    values: List[HeckValue]
+    """One or more values associated with the element."""
+    attributes: Dict[str, HeckValue]
+    """Zero or more attributes associated with the element as a key-value pair."""
+    unparsed: bool
+    """True when the element holds the raw lines of a non-heck section."""
+
+    def __init__(self):
+        self.children = []
+        self.values = []
+        self.attributes = dict()
+        self.name = ""
+        self.unparsed = False
+
+    def flatten(self) -> Mapping:
+        """
+        Collapse the children of this element into a plain dictionary.
+
+        Parsed children are keyed by name; each key maps to a list holding the
+        child's own flattened children (as one dict, when it has any) followed
+        by its values. Children sharing a name extend the same list.
+        Unparsed children are keyed by '%%%UNPARSED%%% ' + name and map to
+        their raw lines joined with newlines. Attributes are not included.
+        """
+        output = {}
+        for elm in self.children:
+            if elm.unparsed:
+                nam = '%%%UNPARSED%%% ' + elm.name
+                val = '\n'.join(elm.values)
+                if nam in output:
+                    output[nam] = '\n'.join([output[nam], val])
+                else:
+                    output[nam] = val
+            else:
+                elmval = []
+                if elm.children:
+                    elmval.append(elm.flatten())
+                elmval.extend(elm.values)
+                output.setdefault(elm.name, []).extend(elmval)
+        return output
+
+    def __str__(self):
+        k = 'Unparsed ' if self.unparsed else ''
+        return (
+            f"{k}HeckElement(name={self.name!r}, values={self.values!r}, "
+            f"attributes={self.attributes!r}, children={self.children!r})"
+        )
+
+    def __repr__(self):
+        return self.__str__()
+
+
+def _make_element(ast: List) -> HeckElement:
+    """
+    Get an element from an element AST from the parser.
+
+    The AST has the shape ['element', name, group, ...] where each optional
+    group is ['values', ('value', v), ...] or
+    ['attributes', ('attribute', key, ('value', v)), ...].
+
+    Raises HeckParseException if the AST is not an element node.
+    """
+    if ast[0] != 'element':
+        raise HeckParseException(f"Found a non-element where an element was expected. {ast}")
+    elm = HeckElement()
+    elm.name = ast[1]
+    for item in ast[2:]:
+        if item[0] == 'values':
+            elm.values = [x[1] for x in item[1:]]
+        elif item[0] == 'attributes':
+            elm.attributes.update({x[1]: x[2][1] for x in item[1:]})
+    return elm
+
+
+def load_heck(inp: Iterable[str]) -> HeckElement:
+    """
+    Load a HECKformat into a tree of HeckElements from a list of lines from the file.
+
+    A '%%% heck' section line switches to element parsing. Any other section
+    name starts an unparsed element that collects the following raw lines
+    (without their line terminator) until the next line starting with '%%%'.
+
+    Raises HeckParseException when an element appears before a '%%% heck'
+    section line, or when a '>' line has no previous element to nest under.
+    """
+    MODE_INIT = 0
+    MODE_ELM = 1
+    MODE_UNPARSE = 2
+
+    rootelm = HeckElement()
+    rootelm.name = "__ROOT__"
+    pelm = [rootelm]  # stack of parents; pelm[-1] receives the next element
+    pdepth = 0  # nesting depth of the previous element line
+    mode = MODE_INIT
+    for idx, line in enumerate(inp):
+        if mode == MODE_UNPARSE:
+            if not line.startswith('%%%'):
+                pelm[-1].values.append(line.rstrip('\r\n'))
+                continue
+            mode = MODE_INIT
+            if not line[3:].strip():
+                # a bare '%%%' only closes the unparsed section
+                continue
+            # otherwise the line is itself the next section header: parse it
+
+        ast = parser.parse(line)
+        if not ast:
+            # blank line, comment-only line, or a line the parser rejected
+            continue
+
+        if ast[0] == 'section':
+            pdepth = 0
+            if ast[1] == 'heck':
+                mode = MODE_ELM
+                pelm = [rootelm]
+            else:
+                mode = MODE_UNPARSE
+                section = HeckElement()
+                section.name = ast[1]
+                section.unparsed = True
+                rootelm.children.append(section)
+                pelm = [section]
+            continue
+
+        if mode != MODE_ELM:
+            raise HeckParseException(f"Didn't find heck preamble, line {idx}")
+
+        if ast[0] == 'deep':
+            # we're in a subitem
+            depth = ast[1]
+            if depth > pdepth:
+                # deeper than last time: the previous sibling becomes the parent
+                try:
+                    pelm.append(pelm[-1].children[-1])
+                except IndexError as err:
+                    raise HeckParseException(
+                        f"Tried to go deeper without a previous element, line {idx}"
+                    ) from err
+            elif depth < pdepth:
+                # shallower than last time: keep the root plus one parent per
+                # remaining level, however many levels were climbed at once
+                del pelm[depth + 1:]
+            ast = ast[2]
+            pdepth = depth
+        elif pdepth > 0:
+            # we're no longer deep, just pop up to the top
+            pdepth = 0
+            pelm = [rootelm]
+        pelm[-1].children.append(_make_element(ast))
+
+    return rootelm
+
+
+def load(infile: TextIO) -> HeckElement:
+    """Load a HECKformat tree from an open text file."""
+    return load_heck(infile.readlines())
+
+
+def loads(ins: str) -> HeckElement:
+    """Load a HECKformat tree from a string."""
+    # '\r\n' has to come first, otherwise it is split as two line breaks.
+    return load_heck(re.split(r'\r\n|\n|\r', ins))
+
+
+TEST_HECK = """
+%%% heck
+# Website!
+title "My Website" bold=True
+subtitle "Yep it's a website"
+scale 3.72
+matrix 0 0 0 0 1 2 3 1 2 3 4 29394.2
+tags hey man what are you doin
+> more tag tag tag 1 2 3
+>> we can go deeper
+>>> we can go even deeper
+test
+> _val 1
+> _val 2
+> _val 3
+valueless
+_more.orless complexelement
+.yooooo
+boolean True
+%%% markdown
+# Some cheeky markdown to confuse our processing.
+
+All my page content goes here.
+"""
+
+if __name__ == "__main__":
+    result = load_heck(TEST_HECK.split('\n'))
+    print(result)
diff --git a/python/heckformat/parser.py b/python/heckformat/parser.py
new file mode 100644
index 0000000..15e9fd4
--- /dev/null
+++ b/python/heckformat/parser.py
@@ -0,0 +1,132 @@
+"""
+Parser for HECKformat lines using PLY Parser.
+"""
+
+import ply.yacc as yacc
+
+from .lexer import tokens
+
+# PLY reads the docstring of every p_* function as its grammar rule, so notes
+# about these functions are written as comments rather than docstrings.
+
+
+def p_value(p):
+    """
+    value : BASE16
+          | BASE10
+          | STRING
+          | ATOM
+    """
+    # Produces ("value", token value).
+    p[0] = ("value", p[1])
+
+
+def p_elm(p):
+    """
+    elm : ATOM
+        | ELEMENT
+    """
+    # The element or section name, passed through unchanged.
+    p[0] = p[1]
+
+
+def p_attribute(p):
+    """attribute : ATOM ATTRIB value"""
+    # Produces ("attribute", key, ("value", v)).
+    p[0] = ("attribute", p[1], p[3])
+
+
+def p_attributes(p):
+    """
+    attributes : attributes attribute
+    attributes : attribute
+    """
+    # Builds ["attributes", attribute, ...] by appending to the list started
+    # by the single-attribute rule.
+    if len(p) == 2:
+        p[0] = ["attributes", p[1]]
+    else:
+        p[0] = p[1]
+        p[0].append(p[2])
+
+
+def p_section(p):
+    """
+    section : SECTION elm
+            | SECTION elm attributes
+    """
+    # Produces ("section", name) or ("section", name, attributes).
+    if len(p) == 3:
+        p[0] = ("section", p[2])
+    else:
+        p[0] = ("section", p[2], p[3])
+
+
+def p_values(p):
+    """
+    values : values value
+    values : value
+    """
+    # Builds ["values", value, ...] by appending to the list started by the
+    # single-value rule.
+    if len(p) == 2:
+        p[0] = ["values", p[1]]
+    else:
+        p[0] = p[1]
+        p[0].append(p[2])
+
+
+def p_element(p):
+    """
+    element : elm values
+            | elm values attributes
+            | elm attributes
+            | elm
+    """
+    # Produces ["element", name] followed by whichever of the values and
+    # attributes groups are present, in source order.
+    if len(p) <= 2:
+        p[0] = ["element", p[1]]
+    else:
+        p[0] = ["element", p[1], p[2]]
+        if len(p) == 4:
+            p[0].append(p[3])
+
+
+def p_statement(p):
+    """
+    statement : element
+              | DEEP element
+              | section
+    """
+    # A DEEP prefix becomes ('deep', number of '>' characters, element).
+    if len(p) > 2:
+        p[0] = ('deep', len(p[1]), p[2])
+    else:
+        p[0] = p[1]
+
+
+def p_error(p):
+    # PLY passes None when the input ends where a token was expected (this
+    # includes empty and comment-only lines); only a concrete token is reported.
+    if p:
+        print(f"Syntax error {p}")
+
+
+parser = yacc.yacc(start="statement")
+
+
+TEST_STRING = [
+    '%%% heck',
+    '%%% heck foo=bar',
+    '%%% heck bar=-5l quux="hello! how are you today?" fred=69 barney=nice',
+    'title "My website!"',
+    'zoom 5.73',
+    'tags yo fresh',
+    'dumper 1 2 3 4 5 6 7 8 9 dumpped=True',
+    '> big_dumper 32 23 384848',
+    '>> deep_dumper 1 2 3 a=false'
+]
+
+if __name__ == "__main__":
+    for test in TEST_STRING:
+        print(parser.parse(test))
diff --git a/python/heckformat/parsetab.py b/python/heckformat/parsetab.py
new file mode 100644
index 0000000..32ad143
--- /dev/null
+++ b/python/heckformat/parsetab.py
@@ -0,0 +1,50 @@
+
+# parsetab.py
+# This file is automatically generated. Do not edit.
+# pylint: disable=W,C,R
+_tabversion = '3.10'
+
+_lr_method = 'LALR'
+
+_lr_signature = 'statementATOM ATTRIB BASE10 BASE16 COMMENT DEEP ELEMENT SECTION STRING\n value : BASE16\n | BASE10\n | STRING\n | ATOM\n \n elm : ATOM\n | ELEMENT\n attribute : ATOM ATTRIB value\n attributes : attributes attribute\n attributes : attribute\n \n section : SECTION elm\n | SECTION elm attributes\n \n values : values value\n values : value\n \n element : elm values\n | elm values attributes\n | elm attributes\n | elm\n \n statement : element\n | DEEP element\n | section\n '
+
+_lr_action_items = {'DEEP':([0,],[3,]),'SECTION':([0,],[6,]),'ATOM':([0,3,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,],[7,7,17,7,-5,-6,17,22,-13,-9,-1,-2,-3,-4,22,22,-12,-8,25,22,-4,-7,]),'ELEMENT':([0,3,6,],[8,8,8,]),'$end':([1,2,4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,24,25,26,],[0,-18,-20,-17,-5,-6,-19,-14,-16,-13,-9,-1,-2,-3,-4,-10,-15,-12,-8,-11,-4,-7,]),'BASE16':([5,7,8,10,12,14,15,16,17,20,23,],[14,-5,-6,14,-13,-1,-2,-3,-4,-12,14,]),'BASE10':([5,7,8,10,12,14,15,16,17,20,23,],[15,-5,-6,15,-13,-1,-2,-3,-4,-12,15,]),'STRING':([5,7,8,10,12,14,15,16,17,20,23,],[16,-5,-6,16,-13,-1,-2,-3,-4,-12,16,]),'ATTRIB':([17,22,],[23,23,]),}
+
+_lr_action = {}
+for _k, _v in _lr_action_items.items():
+   for _x,_y in zip(_v[0],_v[1]):
+      if not _x in _lr_action: _lr_action[_x] = {}
+      _lr_action[_x][_k] = _y
+del _lr_action_items
+
+_lr_goto_items = {'statement':([0,],[1,]),'element':([0,3,],[2,9,]),'section':([0,],[4,]),'elm':([0,3,6,],[5,5,18,]),'values':([5,],[10,]),'attributes':([5,10,18,],[11,19,24,]),'value':([5,10,23,],[12,20,26,]),'attribute':([5,10,11,18,19,24,],[13,13,21,13,21,21,]),}
+
+_lr_goto = {}
+for _k, _v in _lr_goto_items.items():
+   for _x, _y in zip(_v[0], _v[1]):
+       if not _x in _lr_goto: _lr_goto[_x] = {}
+       _lr_goto[_x][_k] = _y
+del _lr_goto_items
+_lr_productions = [
+  ("S' -> statement","S'",1,None,None,None),
+  ('value -> BASE16','value',1,'p_value','parser.py',11),
+  ('value -> BASE10','value',1,'p_value','parser.py',12),
+  ('value -> STRING','value',1,'p_value','parser.py',13),
+  ('value -> ATOM','value',1,'p_value','parser.py',14),
+  ('elm -> ATOM','elm',1,'p_elm','parser.py',22),
+  ('elm -> ELEMENT','elm',1,'p_elm','parser.py',23),
+  ('attribute -> ATOM ATTRIB value','attribute',3,'p_attribute','parser.py',27),
+  ('attributes -> attributes attribute','attributes',2,'p_attributes','parser.py',34),
+  ('attributes -> attribute','attributes',1,'p_attributes','parser.py',35),
+  ('section -> SECTION elm','section',2,'p_section','parser.py',46),
+  ('section -> SECTION elm attributes','section',3,'p_section','parser.py',47),
+  ('values -> values value','values',2,'p_values','parser.py',56),
+  ('values -> value','values',1,'p_values','parser.py',57),
+  ('element -> elm values','element',2,'p_element','parser.py',68),
+  ('element -> elm values attributes','element',3,'p_element','parser.py',69),
+  ('element -> elm attributes','element',2,'p_element','parser.py',70),
+  ('element -> elm','element',1,'p_element','parser.py',71),
+  ('statement -> element','statement',1,'p_statement','parser.py',84),
+  ('statement -> DEEP element','statement',2,'p_statement','parser.py',85),
+  ('statement -> section','statement',1,'p_statement','parser.py',86),
+]