heckformat/python/heck/lexer.py

import ply.lex as lex
from typing import List, Optional
import string
tokens = ('ATOM', 'BASE10', 'BASE16', 'COMMENT', 'STRING', 'SECTION', 'ATTRIB', 'DEEP')
# COMMENT ::= # .*$
# ATOM ::= [A-Za-z_][A-Za-z0-9_-]*
# BASE10NUMBER ::= (-)?[0-9]+(\.[0-9]+)?([FLUIDCfluidc])?
# BASE16NUMBER ::= 0x[0-9A-Fa-f]+
# NUMBER ::= (<BASE10NUMBER>|<BASE16NUMBER>)
# STRING ::= "([^\"]*|(\\)|(\"))"
# VALUE ::= (<ATOM>|<STRING>|<NUMBER>)
# VALUES ::= <VALUE>(\s+<VALUES>)?
# ATTRIBUTENAME ::= <ATOM>
# ATTRIBUTE ::= <ATTRIBUTENAME>=<VALUE>
# ATTRIBUTES ::= <ATTRIBUTE>(\s+<ATTRIBUTES>)?
# SECTIONLABEL ::= <ATOM>
# SECTION ::= %%%\s+<SECTIONLABEL>\s+<ATTRIBUTES>
# ELEMENTLABEL ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]*
# ELEMENT ::= <ELEMENTLABEL>\s+(<VALUES>|<ATTRIBUTES>)
# LINE ::= ^(((>)*<ELEMENT>) | <SECTION> | <COMMENT>) (<COMMENT>|$)
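#
# Illustrative reading of the grammar (an informal example, not part of the spec):
# the test line '> element 5 4 3 2.5' is one LINE made of a single DEEP marker
# ('>') followed by an ELEMENT whose ELEMENTLABEL is 'element' and whose VALUES
# are the numbers 5, 4, 3 and 2.5.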
t_ignore = string.whitespace
t_DEEP = r'^(>)+'
t_ATOM = r'[A-Za-z_$][A-Za-z0-9_.-]*'
t_BASE16 = r'0x[0-9A-Fa-f]+'
t_SECTION = r'^%%%\s'
t_ATTRIB = '='
def t_BASE10(token: lex.LexToken):
    r'(-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?(\b|$)'
    # python numbers are Very Flexible so we ignore the typespec
    vstr = token.value
    if vstr[-1] in 'FLUIDCfluidc':
        vstr = vstr[:-1]
    if '.' in vstr:
        token.value = float(vstr)
    else:
        token.value = int(vstr)
    return token
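# For example (illustrative, matching the test strings below): '1.23f' should lex
# to a single BASE10 token whose value is the float 1.23, with the 'f' typespec
# dropped. Hexadecimal literals such as '0x1F' are matched by the plain t_BASE16
# rule instead and keep their raw string value, since no conversion function is
# defined for them.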
def t_COMMENT(token: lex.LexToken):
    r'\#\s?.*$'
    # Returning nothing discards the comment token.
    ...
def t_STRING(token: lex.LexToken):
    r'"[^"]*"'
    token.value = token.value[1:-1]  # substring to strip double quotes
    return token
def t_error(token: lex.LexToken):
    print(f"{token.lineno} Unexpected character '{token.value[0]}' at position {token.lexpos}.")
    print('... ' + token.value)
    print('    ^')
    # Not skipping the bad character lets PLY raise lex.LexError, which lex_line catches.
    # token.lexer.skip(1)
lexer = lex.lex()
def lex_line(line, lineno=0) -> Optional[List[lex.LexToken]]:
    lexer.lineno = lineno
    try:
        lexer.input(line)
        tokens = []
        while True:
            tok = lexer.token()
            if tok:
                tokens.append(tok)
            else:
                break
        return tokens
    except lex.LexError as inst:
        # fixme: raise a HeckFormat exception
        raise inst
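# Example usage (illustrative, assuming PLY's usual rule ordering): lexing the
# test string '%%% markdown foo=bar' with lex_line should yield token types along
# the lines of SECTION, ATOM('markdown'), ATOM('foo'), ATTRIB, ATOM('bar').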
TEST_STRINGS = [
    '"hi yo123 123xyz #foo" 123xyz 123.223 1f abcd123 123abc $foo "hello world" #foo',
    '1.23f',
    '"hello world!" atom utehuteu tnhoeun_etuhenuoh',
    '"hi yo123 123xyz #foo" xyz 123.223 1f abcd123 abc $foo "hello world" #foo',
    '%%% heck',
    '%%% markdown foo=bar',
    'element 1.2 1.3 1.4 attrib="string value for attribute"',
    '> element 5 4 3 2.5',
]
if __name__ == "__main__":
    for idx, test in enumerate(TEST_STRINGS):
        print(f"Line {idx}: '{test}'")
        try:
            for token in lex_line(test, idx):
                print('    ' + str(token))
        except Exception as inst:
            print(f'Error in line: {inst}')