98 lines
2.9 KiB
Python
98 lines
2.9 KiB
Python
import ply.lex as lex
|
|
|
|
from typing import List, Optional
|
|
|
|
import string
|
|
|
|
# Token names exported to PLY; each name must have a matching t_* rule below.
tokens = ('ATOM', 'BASE10', 'BASE16', 'COMMENT', 'STRING', 'SECTION', 'ATTRIB', 'DEEP')
|
|
|
|
# COMMENT ::= # .*$
|
|
# ATOM ::= [A-Za-z_][A-Za-z0-9_-]*
|
|
# BASE10NUMBER ::= (-)?[0-9]+(\.)?[0-9]+([FLUIDCfluidc])?
|
|
# BASE16NUMBER ::= 0x[0-9A-Fa-f]+
|
|
# NUMBER ::= (<BASE10NUMBER|BASE16NUMBER>)
|
|
# STRING ::= "([^\"]*|(\\)|(\"))"
|
|
# VALUE ::= (<ATOM>|<STRING>|<NUMBER>)
|
|
# VALUES ::= <VALUE>(\s+<VALUES>)?
|
|
# ATTRIBUTENAME ::= <ATOM>
|
|
# ATTRIBUTE ::= <ATTRIBUTENAME>=<VALUE>
|
|
# ATTRIBUTES ::= <ATTRIBUTE>(\s+<ATTRIBUTES>)?
|
|
# SECTIONLABEL ::= <ATOM>
|
|
# SECTION ::= %%%\s+<SECTIONLABEL>\s+<ATTRIBUTES>
|
|
# ELEMENTLABEL ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]*
|
|
# ELEMENT ::= <ELEMENTLABEL>\s+(<VALUES>|<ATTRIBUTES>)
|
|
# LINE ::= ^(((>)*<ELEMENT>) | <SECTION> | <COMMENT>) (<COMMENT>|$)
|
|
|
|
|
|
# Characters PLY silently skips between tokens.  string.whitespace includes
# '\n' and '\r'; input is fed one line at a time (see lex_line), so that
# appears harmless here -- TODO confirm if multi-line input is ever used.
t_ignore = string.whitespace

# One or more leading '>' marks element nesting depth.
t_DEEP = r'^(>)+'

# Bare word / identifier; may also contain '$', '.', and '-'.
t_ATOM = r'[A-Za-z_$][A-Za-z0-9_.-]*'

# Hexadecimal literal, e.g. 0x1F.
t_BASE16 = r'0x[0-9A-Fa-f]+'

# '%%%' at the start of a line introduces a section header.
t_SECTION = r'^%%%\s'

# '=' separates an attribute name from its value.
t_ATTRIB = '='
|
|
|
|
def t_BASE10(token: lex.LexToken):
    r'(-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?(\b|$)'
    # NOTE: in PLY the docstring above *is* the token regex -- do not edit
    # it as prose.  Converts the matched text to an int or float.
    text = token.value
    # Drop a trailing C-style type-spec letter if present; Python numbers
    # are flexible enough that the type-spec can simply be ignored.  The
    # regex permits at most one such suffix character.
    if text.endswith(tuple('FLUIDCfluidc')):
        text = text[:-1]
    # A decimal point means float, otherwise int.
    token.value = float(text) if '.' in text else int(text)
    return token
|
|
|
|
def t_COMMENT(token: lex.LexToken):
    r'\#\s?.*$'
    # The docstring above is the PLY token regex.  Returning nothing
    # (None) tells PLY to discard the matched comment entirely.
    ...
|
|
|
|
def t_STRING(token: lex.LexToken):
    r'"[^"]*"'
    # The docstring above is the PLY token regex.  It guarantees the match
    # starts and ends with exactly one '"' and contains none inside, so
    # strip('"') removes only the two delimiters.
    token.value = token.value.strip('"')
    return token
|
|
|
|
def t_error(token: lex.LexToken):
    """PLY error hook: print a diagnostic for an unlexable character.

    Deliberately does NOT call token.lexer.skip(), so after this returns
    PLY raises lex.LexError and the rest of the line is abandoned
    (lex_line's caller sees the exception).
    """
    print(f"{token.lineno} Unexpected character '{token.value[0]}' at position {token.lexpos}.")
    print('... ' + token.value)
    # Four spaces align the caret under the first (offending) character,
    # matching the four-character '... ' prefix on the previous line.
    print('    ^')
    # token.lexer.skip(1)
|
|
|
|
# Build the module-level lexer from the t_* rules defined above.
lexer = lex.lex()
|
|
|
|
def lex_line(line, lineno=0) -> Optional[List[lex.LexToken]]:
    """Tokenize a single line of input.

    Args:
        line: the text to tokenize (one logical line of the format).
        lineno: line number stamped onto the lexer for diagnostics.

    Returns:
        The list of tokens produced for the line (possibly empty).

    Raises:
        lex.LexError: when the line contains an unlexable character
            (t_error does not skip, so PLY aborts the scan).
    """
    lexer.lineno = lineno
    try:
        lexer.input(line)
        # lexer.token() returns None at end of input, which iter() uses as
        # its sentinel; this also avoids shadowing the module-level
        # ``tokens`` tuple, which the original local list did.
        return list(iter(lexer.token, None))
    except lex.LexError:
        # fixme raise a HeckFormat exception
        # Bare raise preserves the original traceback; ``raise inst`` would
        # append a redundant frame.
        raise
|
|
|
|
# Sample input lines exercised by the __main__ smoke test below: mixed
# atoms/numbers/strings, section headers, attributes, and nesting depth.
TEST_STRINGS = [
    '"hi yo123 123xyz #foo" 123xyz 123.223 1f abcd123 123abc $foo "hello world" #foo',
    '1.23f',
    '"hello world!" atom utehuteu tnhoeun_etuhenuoh',
    '"hi yo123 123xyz #foo" xyz 123.223 1f abcd123 abc $foo "hello world" #foo',
    '%%% heck',
    '%%% markdown foo=bar',
    'element 1.2 1.3 1.4 attrib="string value for attribute"',
    '> element 5 4 3 2.5',
]
|
|
|
|
if __name__ == "__main__":
    # Smoke-test the lexer against the sample lines above.
    for idx, test in enumerate(TEST_STRINGS):
        print(f"Line {idx}: '{test}'")
        try:
            for token in lex_line(test, idx):
                print('    ' + str(token))
        except Exception as inst:
            # Report what actually went wrong; the original
            # ``print(f'Error in line.')`` was an f-string with no
            # placeholders and silently discarded the exception.
            print(f'Error in line: {inst}')
|