Initial checkin of Python parser for heckfiles.

This commit is contained in:
Cassowary 2024-01-31 08:30:20 -08:00
parent e1bfcb4815
commit c69778d7e4
7 changed files with 717 additions and 7 deletions

View File

@ -4,7 +4,7 @@
%%% heck
element value value
element value value
element tag=value
element attribute=value
> subelement value
> subelement value
>> sub-subelement value
@ -23,19 +23,23 @@ NUMBER ::= (<BASE10NUMBER|BASE16NUMBER>)
STRING ::= "([^\"]*|(\\)|(\"))"
VALUE ::= (<ATOM>|<STRING>|<NUMBER>)
VALUES ::= <VALUE>(\s+<VALUES>)?
TAGNAME ::= <ATOM>
TAG ::= <TAGNAME>=<VALUE>
TAGS ::= <TAG>(\s+<TAGS>)?
ATTRIBUTENAME ::= <ATOM>
ATTRIBUTE ::= <ATTRIBUTENAME>=<VALUE>
ATTRIBUTES ::= <ATTRIBUTE>(\s+<ATTRIBUTES>)?
SECTIONLABEL ::= <ATOM>
SECTION ::= %%%\s+<SECTIONLABEL>\s+<TAGS>
SECTION ::= %%%\s+<SECTIONLABEL>\s+<ATTRIBUTES>
ELEMENTLABEL ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]?
ELEMENT ::= <ELEMENTLABEL>\s+(<VALUES>|<TAGS>)
ELEMENT ::= <ELEMENTLABEL>\s+(<VALUES>|<ATTRIBUTES>)
LINE ::= ^(((>)*<ELEMENT>) | <SECTION> | <COMMENT>) (<COMMENT>|$)
```
Heck is composed of a series of elements each with a label and one or more values. Elements may also have several key-value pairs associated with them as tags. Elements may also have sub-elements which then take the place of the value indicated with a > before the value at the current level. The heck document also can have several sections. Sections all start with %%% and a label. If the label is heck, the section is interpreted as more elements for the heck document. Any other label is stored as an element of arbitrary string value under that label name.
Heck is composed of a series of elements, each with a label and one or more values. Elements may also have several key-value pairs associated with them as attributes. Elements may also have sub-elements, which take the place of values; a sub-element is written on its own line, prefixed with one > per level of nesting. A heck document can also have several sections. Each section starts with %%% and a label. If the label is heck, the section is interpreted as more elements for the heck document. A section with any other label is stored as an element of arbitrary string value under that label name.
The data structure represented is an ordered array. Elements may have the same name as previous elements in the same container.
## APIs
Heckformat APIs should provide a way to iterate the elements, to access their attributes as a mapping, to access their values as an array, and collect all elements within the root (or another element) of a specific kind (or set of kinds) as an array.

97
python/heck/lexer.py Normal file
View File

@ -0,0 +1,97 @@
import ply.lex as lex
from typing import List, Optional
import string
# Token type names produced by this lexer; parser.py imports this tuple.
tokens = ('ATOM', 'BASE10', 'BASE16', 'COMMENT', 'STRING', 'SECTION', 'ATTRIB', 'DEEP')
# Reference grammar for the heck format. The token rules below do not follow it
# exactly (compare ATOM and STRING here with t_ATOM and t_STRING).
# COMMENT ::= # .*$
# ATOM ::= [A-Za-z_][A-Za-z0-9_-]?
# BASE10NUMBER ::= (-)?[0-9]+(\.)?[0-9]+([FLUIDCfluidc])?
# BASE16NUMBER ::= 0x[0-9A-Fa-f]+
# NUMBER ::= (<BASE10NUMBER|BASE16NUMBER>)
# STRING ::= "([^\"]*|(\\)|(\"))"
# VALUE ::= (<ATOM>|<STRING>|<NUMBER>)
# VALUES ::= <VALUE>(\s+<VALUES>)?
# ATTRIBUTENAME ::= <ATOM>
# ATTRIBUTE ::= <ATTRIBUTENAME>=<VALUE>
# ATTRIBUTES ::= <ATTRIBUTE>(\s+<ATTRIBUTES>)?
# SECTIONLABEL ::= <ATOM>
# SECTION ::= %%%\s+<SECTIONLABEL>\s+<ATTRIBUTES>
# ELEMENTLABEL ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]?
# ELEMENT ::= <ELEMENTLABEL>\s+(<VALUES>|<ATTRIBUTES>)
# LINE ::= ^(((>)*<ELEMENT>) | <SECTION> | <COMMENT>) (<COMMENT>|$)
# Characters skipped between tokens.
t_ignore = string.whitespace
# Run of '>' anchored at the very start of the input (sub-element marker).
t_DEEP = r'^(>)+'
# Bare word. Broader than the grammar's ATOM: a leading '$' and embedded '.'
# are also accepted, and any number of trailing characters is allowed.
t_ATOM = r'[A-Za-z_$][A-Za-z0-9_.-]*'
# Hexadecimal literal; this is a plain pattern with no conversion function.
t_BASE16 = r'0x[0-9A-Fa-f]+'
# Section marker: '%%%' followed by one whitespace character, at input start.
t_SECTION = r'^%%%\s'
# Separator between an attribute name and its value.
t_ATTRIB = '='
def t_BASE10(token: lex.LexToken):
    r'(-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?(\b|$)'
    # The raw string above is this token's regex (read from the docstring slot),
    # so it has to stay the first statement and must not be reworded.
    # Python numbers need no width/sign marker, so a trailing type letter is
    # dropped before conversion.
    literal = token.value
    digits = literal[:-1] if literal[-1] in 'FLUIDCfluidc' else literal
    # A decimal point selects float; everything else becomes int.
    token.value = float(digits) if '.' in digits else int(digits)
    return token
def t_COMMENT(token: lex.LexToken):
    r'\#\s?.*$'
    # The raw string above is this token's regex, not documentation; keep it as
    # the first statement. It matches '#' through the end of the input.
    # Nothing is returned, so no COMMENT token is handed on to the parser.
    ...
def t_STRING(token: lex.LexToken):
    r'"[^"]*"'
    # The raw string above is this token's regex; it only matches text wrapped
    # in a pair of double quotes, with no escape handling.
    quoted = token.value
    # Keep everything between the opening and closing quote.
    token.value = quoted[1:len(quoted) - 1]
    return token
def t_error(token: lex.LexToken):
    # Lexer error hook: print the line number, the offending character, its
    # position, and the remaining unscanned text.
    report = (
        f"{token.lineno} Unexpected character '{token.value[0]}' at position {token.lexpos}.",
        '... ' + token.value,
        ' ^',
    )
    for row in report:
        print(row)
    # The bad character is not skipped (the call below is left disabled).
    # NOTE(review): presumably so that scanning stops at the first error - confirm.
    # token.lexer.skip(1)
# Module-level lexer instance; lex_line() below feeds it one line at a time.
lexer = lex.lex()
def lex_line(line, lineno=0) -> Optional[List[lex.LexToken]]:
    """Tokenize a single line of heck text with the module-level lexer.

    Args:
        line: Text handed to the lexer.
        lineno: Line number stored on the lexer before scanning starts.

    Returns:
        The tokens produced for ``line``, in the order they were scanned.

    Raises:
        lex.LexError: Passed through unchanged when the lexer raises it.
    """
    lexer.lineno = lineno
    try:
        lexer.input(line)
        # lexer.token() yields None once the input is exhausted.
        return list(iter(lexer.token, None))
    except lex.LexError:
        # fixme raise a HeckFormat exception
        raise
# Sample inputs for the self-test at the bottom of this module: quoted strings,
# numbers with and without type suffixes, atoms, section headers, attributes
# and a '>' sub-element line. Several contain text such as "123xyz".
TEST_STRINGS = [
    '"hi yo123 123xyz #foo" 123xyz 123.223 1f abcd123 123abc $foo "hello world" #foo',
    '1.23f',
    '"hello world!" atom utehuteu tnhoeun_etuhenuoh',
    '"hi yo123 123xyz #foo" xyz 123.223 1f abcd123 abc $foo "hello world" #foo',
    '%%% heck',
    '%%% markdown foo=bar',
    'element 1.2 1.3 1.4 attrib="string value for attribute"',
    '> element 5 4 3 2.5',
]
if __name__ == "__main__":
    # Self-test: tokenize every sample line and print the resulting tokens.
    for idx, test in enumerate(TEST_STRINGS):
        print(f"Line {idx}: '{test}'")
        try:
            for token in lex_line(test, idx):
                print(' ' + str(token))
        except Exception as inst:
            # Carry on with the remaining samples, but report what went wrong
            # instead of discarding the exception.
            print(f'Error in line: {inst}')

127
python/heck/parse.py Normal file
View File

@ -0,0 +1,127 @@
from typing import Iterable, Union, Mapping, TypeVar, List
import re
from parser import parser
class HeckException(Exception):
    """Base class for the exceptions defined by this module."""


class HeckParseException(HeckException):
    """Raised when parsed heck input does not have the expected structure."""
# Anything an element can hold as a value: a nested element or a scalar.
# "HeckElement" is a string forward reference because the class is defined
# further down; a TypeVar is a generic type parameter, not a forward reference.
HeckValue = Union["HeckElement", str, int, float]
class HeckElement:
    """One node of a loaded heck document.

    Attributes:
        name: Element or section label; empty until assigned.
        children: Child elements, in the order they were added.
        values: Positional values; for an unparsed section, its raw lines.
        attributes: Key/value pairs attached to the element.
        unparsed: True when the node holds the raw lines of a non-heck section.
    """

    # Element types are quoted so the annotations do not depend on names that
    # are not defined yet when the class body runs.
    name: str
    children: list["HeckElement"]
    values: list["HeckValue"]
    attributes: dict[str, "HeckValue"]
    unparsed: bool

    def __init__(self):
        # Fresh containers per instance so elements never share state.
        self.children = []
        self.values = []
        self.attributes = dict()
        self.name = ""
        self.unparsed = False

    def __str__(self):
        prefix = 'Unparsed ' if self.unparsed else ''
        return f"<HeckElement {prefix}{self.name} c={self.children} v={self.values} a={self.attributes}>"

    def __repr__(self):
        # Same text as str() so nested elements print readably inside lists.
        return self.__str__()
# COMMENT ::= # .*$
# ATOM ::= [A-Za-z_][A-Za-z0-9_-]?
# BASE10NUMBER ::= (-)?[0-9]+(\.)?[0-9]+([FLUIDCfluidc])?
# BASE16NUMBER ::= 0x[0-9A-Fa-f]+
# NUMBER ::= (<BASE10NUMBER|BASE16NUMBER>)
# STRING ::= "([^\"]*|(\\)|(\"))"
# VALUE ::= (<ATOM>|<STRING>|<NUMBER>)
# VALUES ::= <VALUE>(\s+<VALUES>)?
# ATTRIBUTENAME ::= <ATOM>
# ATTRIBUTE ::= <ATTRIBUTENAME>=<VALUE>
# ATTRIBUTES ::= <ATTRIBUTE>(\s+<ATTRIBUTES>)?
# SECTIONLABEL ::= <ATOM>
# SECTION ::= %%%\s+<SECTIONLABEL>\s+<ATTRIBUTES>
# ELEMENTLABEL ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]?
# ELEMENT ::= <ELEMENTLABEL>\s+(<VALUES>|<ATTRIBUTES>)
# LINE ::= ^(((>)*<ELEMENT>) | <SECTION> | <COMMENT>) (<COMMENT>|$)
# ATOM = re.compile(r'[A-Za-z_][A-Za-z0-9_-]*')
def get_element(ast: List) -> HeckElement:
    """Convert an ``element`` parse node into a HeckElement.

    Args:
        ast: Node of the form ``['element', name, part, ...]`` where each part
            is a ``['values', ...]`` or ``['attributes', ...]`` list.

    Returns:
        A new HeckElement carrying the node's name, values and attributes.

    Raises:
        HeckParseException: If the node's tag is not ``'element'``.
    """
    if ast[0] != 'element':
        raise HeckParseException("Found a non-element where an element was expected.")

    node = HeckElement()
    node.name = ast[1]
    for part in ast[2:]:
        tag = part[0]
        if tag == 'values':
            # Each entry is ('value', payload); keep only the payload.
            node.values = [entry[1] for entry in part[1:]]
        elif tag == 'attributes':
            # Each entry is ('attribute', key, ('value', payload)).
            pairs = {entry[1]: entry[2][1] for entry in part[1:]}
            node.attributes.update(pairs)
    return node
def load_heck(inp: Iterable[str]) -> HeckElement:
    """Build a HeckElement tree from the lines of a heck document.

    Lines inside a ``%%% heck`` section are parsed as elements and appended to
    the root. Any other ``%%% <label>`` section becomes a child of the root
    named after the label and flagged ``unparsed``; its body lines are stored
    verbatim in ``values`` until the next line starting with ``%%%``.

    Args:
        inp: The document, one line per item.

    Returns:
        The root element, named ``__ROOT__``.

    Raises:
        HeckParseException: If an element line appears before any
            ``%%% heck`` header, or a line parses to something other than an
            element or a section. Errors raised by the parser itself
            propagate unchanged.
    """
    MODE_INIT = 0     # no usable section header seen yet
    MODE_ELM = 1      # inside a heck section: lines are elements
    MODE_UNPARSE = 2  # inside any other section: lines are kept raw

    rootelm = HeckElement()
    rootelm.name = "__ROOT__"
    pelm = rootelm  # element currently receiving children / raw lines
    mode = MODE_INIT

    # Count from 1 so the number in error messages matches an editor's.
    for lineno, line in enumerate(inp, start=1):
        if mode == MODE_UNPARSE:
            if not line.startswith('%%%'):
                pelm.values.append(line)
                continue
            # A new header ends the raw section. Fall through so the header
            # itself is parsed; otherwise the section it opens would be lost.
            mode = MODE_INIT

        ast = parser.parse(line)
        if not ast:
            # Nothing to record (e.g. blank or comment-only line).
            continue

        if ast[0] == 'section':
            if ast[1] == 'heck':
                mode = MODE_ELM
                pelm = rootelm
            else:
                mode = MODE_UNPARSE
                pelm = HeckElement()
                pelm.name = ast[1]
                pelm.unparsed = True
                rootelm.children.append(pelm)
        elif mode != MODE_ELM:
            raise HeckParseException(f"Didn't find heck preamble, line {lineno}")
        else:
            pelm.children.append(get_element(ast))
    return rootelm
# Sample document for the self-test below: a heck section with a comment and
# several elements, followed by a markdown section whose body is kept raw.
TEST_HECK = """
%%% heck
# Website!
title "My Website" bold=True
subtitle "Yep it's a website"
scale 3.72
matrix 0 0 0 0 1 2 3 1 2 3 4 29394.2
tags hey man what are you doin
%%% markdown
# Some cheeky markdown to confuse our processing.
All my page content goes here.
"""
if __name__ == "__main__":
    # Load the sample line by line and print the resulting element tree.
    result = load_heck(TEST_HECK.split('\n'))
    print(result)

333
python/heck/parser.out Normal file
View File

@ -0,0 +1,333 @@
Created by PLY version 3.11 (http://www.dabeaz.com/ply)
Unused terminals:
COMMENT
Grammar
Rule 0 S' -> statement
Rule 1 value -> BASE16
Rule 2 value -> BASE10
Rule 3 value -> STRING
Rule 4 value -> ATOM
Rule 5 attribute -> ATOM ATTRIB value
Rule 6 attributes -> attributes attribute
Rule 7 attributes -> attribute
Rule 8 section -> SECTION ATOM
Rule 9 section -> SECTION ATOM attributes
Rule 10 values -> values value
Rule 11 values -> value
Rule 12 element -> ATOM values
Rule 13 element -> ATOM values attributes
Rule 14 element -> ATOM attributes
Rule 15 statement -> element
Rule 16 statement -> DEEP element
Rule 17 statement -> section
Terminals, with rules where they appear
ATOM : 4 5 8 9 12 13 14
ATTRIB : 5
BASE10 : 2
BASE16 : 1
COMMENT :
DEEP : 16
SECTION : 8 9
STRING : 3
error :
Nonterminals, with rules where they appear
attribute : 6 7
attributes : 6 9 13 14
element : 15 16
section : 17
statement : 0
value : 5 10 11
values : 10 12 13
Parsing method: LALR
state 0
(0) S' -> . statement
(15) statement -> . element
(16) statement -> . DEEP element
(17) statement -> . section
(12) element -> . ATOM values
(13) element -> . ATOM values attributes
(14) element -> . ATOM attributes
(8) section -> . SECTION ATOM
(9) section -> . SECTION ATOM attributes
DEEP shift and go to state 3
ATOM shift and go to state 5
SECTION shift and go to state 6
statement shift and go to state 1
element shift and go to state 2
section shift and go to state 4
state 1
(0) S' -> statement .
state 2
(15) statement -> element .
$end reduce using rule 15 (statement -> element .)
state 3
(16) statement -> DEEP . element
(12) element -> . ATOM values
(13) element -> . ATOM values attributes
(14) element -> . ATOM attributes
ATOM shift and go to state 5
element shift and go to state 7
state 4
(17) statement -> section .
$end reduce using rule 17 (statement -> section .)
state 5
(12) element -> ATOM . values
(13) element -> ATOM . values attributes
(14) element -> ATOM . attributes
(10) values -> . values value
(11) values -> . value
(6) attributes -> . attributes attribute
(7) attributes -> . attribute
(1) value -> . BASE16
(2) value -> . BASE10
(3) value -> . STRING
(4) value -> . ATOM
(5) attribute -> . ATOM ATTRIB value
BASE16 shift and go to state 13
BASE10 shift and go to state 14
STRING shift and go to state 15
ATOM shift and go to state 8
values shift and go to state 9
attributes shift and go to state 10
value shift and go to state 11
attribute shift and go to state 12
state 6
(8) section -> SECTION . ATOM
(9) section -> SECTION . ATOM attributes
ATOM shift and go to state 16
state 7
(16) statement -> DEEP element .
$end reduce using rule 16 (statement -> DEEP element .)
state 8
(4) value -> ATOM .
(5) attribute -> ATOM . ATTRIB value
BASE16 reduce using rule 4 (value -> ATOM .)
BASE10 reduce using rule 4 (value -> ATOM .)
STRING reduce using rule 4 (value -> ATOM .)
ATOM reduce using rule 4 (value -> ATOM .)
$end reduce using rule 4 (value -> ATOM .)
ATTRIB shift and go to state 17
state 9
(12) element -> ATOM values .
(13) element -> ATOM values . attributes
(10) values -> values . value
(6) attributes -> . attributes attribute
(7) attributes -> . attribute
(1) value -> . BASE16
(2) value -> . BASE10
(3) value -> . STRING
(4) value -> . ATOM
(5) attribute -> . ATOM ATTRIB value
$end reduce using rule 12 (element -> ATOM values .)
BASE16 shift and go to state 13
BASE10 shift and go to state 14
STRING shift and go to state 15
ATOM shift and go to state 8
attributes shift and go to state 18
value shift and go to state 19
attribute shift and go to state 12
state 10
(14) element -> ATOM attributes .
(6) attributes -> attributes . attribute
(5) attribute -> . ATOM ATTRIB value
$end reduce using rule 14 (element -> ATOM attributes .)
ATOM shift and go to state 20
attribute shift and go to state 21
state 11
(11) values -> value .
BASE16 reduce using rule 11 (values -> value .)
BASE10 reduce using rule 11 (values -> value .)
STRING reduce using rule 11 (values -> value .)
ATOM reduce using rule 11 (values -> value .)
$end reduce using rule 11 (values -> value .)
state 12
(7) attributes -> attribute .
ATOM reduce using rule 7 (attributes -> attribute .)
$end reduce using rule 7 (attributes -> attribute .)
state 13
(1) value -> BASE16 .
BASE16 reduce using rule 1 (value -> BASE16 .)
BASE10 reduce using rule 1 (value -> BASE16 .)
STRING reduce using rule 1 (value -> BASE16 .)
ATOM reduce using rule 1 (value -> BASE16 .)
$end reduce using rule 1 (value -> BASE16 .)
state 14
(2) value -> BASE10 .
BASE16 reduce using rule 2 (value -> BASE10 .)
BASE10 reduce using rule 2 (value -> BASE10 .)
STRING reduce using rule 2 (value -> BASE10 .)
ATOM reduce using rule 2 (value -> BASE10 .)
$end reduce using rule 2 (value -> BASE10 .)
state 15
(3) value -> STRING .
BASE16 reduce using rule 3 (value -> STRING .)
BASE10 reduce using rule 3 (value -> STRING .)
STRING reduce using rule 3 (value -> STRING .)
ATOM reduce using rule 3 (value -> STRING .)
$end reduce using rule 3 (value -> STRING .)
state 16
(8) section -> SECTION ATOM .
(9) section -> SECTION ATOM . attributes
(6) attributes -> . attributes attribute
(7) attributes -> . attribute
(5) attribute -> . ATOM ATTRIB value
$end reduce using rule 8 (section -> SECTION ATOM .)
ATOM shift and go to state 20
attributes shift and go to state 22
attribute shift and go to state 12
state 17
(5) attribute -> ATOM ATTRIB . value
(1) value -> . BASE16
(2) value -> . BASE10
(3) value -> . STRING
(4) value -> . ATOM
BASE16 shift and go to state 13
BASE10 shift and go to state 14
STRING shift and go to state 15
ATOM shift and go to state 23
value shift and go to state 24
state 18
(13) element -> ATOM values attributes .
(6) attributes -> attributes . attribute
(5) attribute -> . ATOM ATTRIB value
$end reduce using rule 13 (element -> ATOM values attributes .)
ATOM shift and go to state 20
attribute shift and go to state 21
state 19
(10) values -> values value .
BASE16 reduce using rule 10 (values -> values value .)
BASE10 reduce using rule 10 (values -> values value .)
STRING reduce using rule 10 (values -> values value .)
ATOM reduce using rule 10 (values -> values value .)
$end reduce using rule 10 (values -> values value .)
state 20
(5) attribute -> ATOM . ATTRIB value
ATTRIB shift and go to state 17
state 21
(6) attributes -> attributes attribute .
ATOM reduce using rule 6 (attributes -> attributes attribute .)
$end reduce using rule 6 (attributes -> attributes attribute .)
state 22
(9) section -> SECTION ATOM attributes .
(6) attributes -> attributes . attribute
(5) attribute -> . ATOM ATTRIB value
$end reduce using rule 9 (section -> SECTION ATOM attributes .)
ATOM shift and go to state 20
attribute shift and go to state 21
state 23
(4) value -> ATOM .
ATOM reduce using rule 4 (value -> ATOM .)
$end reduce using rule 4 (value -> ATOM .)
state 24
(5) attribute -> ATOM ATTRIB value .
ATOM reduce using rule 5 (attribute -> ATOM ATTRIB value .)
$end reduce using rule 5 (attribute -> ATOM ATTRIB value .)

102
python/heck/parser.py Normal file
View File

@ -0,0 +1,102 @@
import ply.yacc as yacc
from lexer import tokens
def p_value(p):
    """
    value : BASE16
          | BASE10
          | STRING
          | ATOM
    """
    # The docstring above is the grammar rule itself; keep prose out of it.
    # Wrap the matched token's value in a tagged ("value", payload) tuple.
    payload = p[1]
    p[0] = ("value", payload)
def p_attribute(p):
    """attribute : ATOM ATTRIB value"""
    # The docstring above is the grammar rule itself; keep prose out of it.
    # p[2] is the '=' separator and carries no information.
    key, wrapped = p[1], p[3]
    p[0] = ("attribute", key, wrapped)
def p_attributes(p):
    """
    attributes : attributes attribute
    attributes : attribute
    """
    # The docstring above is the grammar rule itself; keep prose out of it.
    if len(p) != 2:
        # Recursive case: extend the list built so far, in place.
        collected = p[1]
        collected.append(p[2])
        p[0] = collected
    else:
        # Base case: start a tagged list holding the first attribute.
        p[0] = ["attributes", p[1]]
def p_section(p):
    """
    section : SECTION ATOM
            | SECTION ATOM attributes
    """
    # The docstring above is the grammar rule itself; keep prose out of it.
    label = p[2]
    # The attribute list is included only when the header carried one.
    p[0] = ("section", label) if len(p) == 3 else ("section", label, p[3])
def p_values(p):
    """
    values : values value
    values : value
    """
    # The docstring above is the grammar rule itself; keep prose out of it.
    if len(p) != 2:
        # Recursive case: extend the list built so far, in place.
        collected = p[1]
        collected.append(p[2])
        p[0] = collected
    else:
        # Base case: start a tagged list holding the first value.
        p[0] = ["values", p[1]]
def p_element(p):
    """
    element : ATOM values
            | ATOM values attributes
            | ATOM attributes
    """
    # The docstring above is the grammar rule itself; keep prose out of it.
    # Node layout: ["element", name, first part, optional second part].
    node = ["element", p[1], p[2]]
    if len(p) == 4:
        # Values were followed by attributes; keep both parts.
        node.append(p[3])
    p[0] = node
def p_statement(p):
    """
    statement : element
              | DEEP element
              | section
    """
    # The docstring above is the grammar rule itself; keep prose out of it.
    # A DEEP-prefixed element is wrapped as ('deep', element); the DEEP token's
    # own text is not kept. Anything else is passed through unchanged.
    p[0] = p[1] if len(p) <= 2 else ('deep', p[2])
def p_error(p):
    # Parser error hook. A falsy p (no offending token) is ignored silently;
    # otherwise the offending token is reported on stdout.
    if not p:
        return
    # The f prefix is required so the token is interpolated into the message.
    print(f"Syntax error {p}")
# Module-level parser built from the p_* rules above, with "statement" as the
# start symbol; it parses one line of heck text per call.
parser = yacc.yacc(start="statement")
# Sample lines for the self-test below: section headers with and without
# attributes, elements with values and attributes, and a '>' sub-element.
TEST_STRING = [
    '%%% heck',
    '%%% heck foo=bar',
    '%%% heck bar=-5l quux="hello! how are you today?" fred=69 barney=nice',
    'title "My website!"',
    'zoom 5.73',
    'tags yo fresh',
    'dumper 1 2 3 4 5 6 7 8 9 dumpped=True',
    '> big_dumper 32 23 384848',
]
if __name__ == "__main__":
    # Self-test: print the parse result of every sample line.
    for test in TEST_STRING:
        print(parser.parse(test))

47
python/heck/parsetab.py Normal file
View File

@ -0,0 +1,47 @@
# parsetab.py
# This file is automatically generated. Do not edit.
# pylint: disable=W,C,R
_tabversion = '3.10'
_lr_method = 'LALR'
_lr_signature = 'statementATOM ATTRIB BASE10 BASE16 COMMENT DEEP SECTION STRING\n value : BASE16\n | BASE10\n | STRING\n | ATOM\n attribute : ATOM ATTRIB value\n attributes : attributes attribute\n attributes : attribute\n \n section : SECTION ATOM\n | SECTION ATOM attributes\n \n values : values value\n values : value\n \n element : ATOM values\n | ATOM values attributes\n | ATOM attributes\n \n statement : element\n | DEEP element\n | section\n '
_lr_action_items = {'DEEP':([0,],[3,]),'ATOM':([0,3,5,6,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,],[5,5,8,16,-4,8,20,-11,-7,-1,-2,-3,20,23,20,-10,-6,20,-4,-5,]),'SECTION':([0,],[6,]),'$end':([1,2,4,7,8,9,10,11,12,13,14,15,16,18,19,21,22,23,24,],[0,-15,-17,-16,-4,-12,-14,-11,-7,-1,-2,-3,-8,-13,-10,-6,-9,-4,-5,]),'BASE16':([5,8,9,11,13,14,15,17,19,],[13,-4,13,-11,-1,-2,-3,13,-10,]),'BASE10':([5,8,9,11,13,14,15,17,19,],[14,-4,14,-11,-1,-2,-3,14,-10,]),'STRING':([5,8,9,11,13,14,15,17,19,],[15,-4,15,-11,-1,-2,-3,15,-10,]),'ATTRIB':([8,20,],[17,17,]),}
_lr_action = {}
for _k, _v in _lr_action_items.items():
for _x,_y in zip(_v[0],_v[1]):
if not _x in _lr_action: _lr_action[_x] = {}
_lr_action[_x][_k] = _y
del _lr_action_items
_lr_goto_items = {'statement':([0,],[1,]),'element':([0,3,],[2,7,]),'section':([0,],[4,]),'values':([5,],[9,]),'attributes':([5,9,16,],[10,18,22,]),'value':([5,9,17,],[11,19,24,]),'attribute':([5,9,10,16,18,22,],[12,12,21,12,21,21,]),}
_lr_goto = {}
for _k, _v in _lr_goto_items.items():
for _x, _y in zip(_v[0], _v[1]):
if not _x in _lr_goto: _lr_goto[_x] = {}
_lr_goto[_x][_k] = _y
del _lr_goto_items
_lr_productions = [
("S' -> statement","S'",1,None,None,None),
('value -> BASE16','value',1,'p_value','parser.py',7),
('value -> BASE10','value',1,'p_value','parser.py',8),
('value -> STRING','value',1,'p_value','parser.py',9),
('value -> ATOM','value',1,'p_value','parser.py',10),
('attribute -> ATOM ATTRIB value','attribute',3,'p_attribute','parser.py',17),
('attributes -> attributes attribute','attributes',2,'p_attributes','parser.py',24),
('attributes -> attribute','attributes',1,'p_attributes','parser.py',25),
('section -> SECTION ATOM','section',2,'p_section','parser.py',36),
('section -> SECTION ATOM attributes','section',3,'p_section','parser.py',37),
('values -> values value','values',2,'p_values','parser.py',46),
('values -> value','values',1,'p_values','parser.py',47),
('element -> ATOM values','element',2,'p_element','parser.py',58),
('element -> ATOM values attributes','element',3,'p_element','parser.py',59),
('element -> ATOM attributes','element',2,'p_element','parser.py',60),
('statement -> element','statement',1,'p_statement','parser.py',70),
('statement -> DEEP element','statement',2,'p_statement','parser.py',71),
('statement -> section','statement',1,'p_statement','parser.py',72),
]