heckformat/python/heck/lexer.py

118 lines
3.3 KiB
Python
Raw Normal View History

"""
Lexical analyzer for HECKformat lines using PLY Lex.
"""
import string
from typing import List, Optional

import ply.lex as lex

from .exceptions import HeckLexException

# Token type names handed to PLY; each one has a matching ``t_<NAME>`` rule
# defined further down in this module.
tokens = (
    'ATOM',
    'BASE10',
    'BASE16',
    'COMMENT',
    'STRING',
    'SECTION',
    'ATTRIB',
    'DEEP',
)
# COMMENT ::= # .*$
# ATOM ::= [A-Za-z_$][A-Za-z0-9_.-]*
# BASE10NUMBER ::= (-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?
# BASE16NUMBER ::= 0x[0-9A-Fa-f]+
# NUMBER ::= (<BASE10NUMBER>|<BASE16NUMBER>)
# STRING ::= "([^\"]*|(\\)|(\"))"
# VALUE ::= (<ATOM>|<STRING>|<NUMBER>)
# VALUES ::= <VALUE>(\s+<VALUES>)?
# ATTRIBUTENAME ::= <ATOM>
# ATTRIBUTE ::= <ATTRIBUTENAME>=<VALUE>
# ATTRIBUTES ::= <ATTRIBUTE>(\s+<ATTRIBUTES>)?
# SECTIONLABEL ::= <ATOM>
# SECTION ::= %%%\s+<SECTIONLABEL>\s+<ATTRIBUTES>
# ELEMENTLABEL ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]?
# ELEMENT ::= <ELEMENTLABEL>\s+(<VALUES>|<ATTRIBUTES>)
# LINE ::= ^(((>)*<ELEMENT>) | <SECTION> | <COMMENT>) (<COMMENT>|$)
# Simple (string-valued) PLY rules. Their text is used as a regex by PLY, so
# the patterns themselves must not be edited casually.

# Characters skipped between tokens: everything in string.whitespace.
t_ignore = string.whitespace
# One or more '>' characters, anchored with '^'.
t_DEEP = r'^(>)+'
# Hexadecimal literal with a '0x' prefix; no conversion function is attached,
# so the token value stays the matched text.
t_BASE16 = r'0x[0-9A-Fa-f]+'
# Section marker: '%%%' anchored with '^' and followed by one whitespace char.
t_SECTION = r'^%%%\s'
# The '=' separating an attribute name from its value.
t_ATTRIB = '='
# ATOM rule. The raw string that opens the function body is the regex PLY
# matches against (it is not free-form documentation and must stay first).
# The spellings true/True and false/False are converted to Python booleans;
# every other atom keeps its matched text as the token value.
def t_ATOM(token: lex.LexToken):
    r'[A-Za-z_$][A-Za-z0-9_.-]*'
    boolean_spellings = {
        'true': True,
        'True': True,
        'false': False,
        'False': False,
    }
    # Fall back to the unchanged text when the atom is not a boolean word.
    token.value = boolean_spellings.get(token.value, token.value)
    return token
# BASE10 rule. The raw string that opens the function body is the regex PLY
# matches against and must stay the first statement.
# The matched text is converted to a Python number: a single trailing
# typespec letter (one of FLUIDCfluidc) is dropped, then the remainder becomes
# a float when it contains '.', otherwise an int.
def t_BASE10(token: lex.LexToken):
    r'(-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?(\b|$)'
    matched = token.value
    # Python numbers are flexible enough that the typespec is simply ignored.
    digits = matched[:-1] if matched[-1] in 'FLUIDCfluidc' else matched
    convert = float if '.' in digits else int
    token.value = convert(digits)
    return token
# COMMENT rule. The raw string that opens the function body is the regex PLY
# matches against and must stay the first statement.
# Nothing is returned, so a matched comment produces no token.
def t_COMMENT(token: lex.LexToken):
    r'\#\s?.*$'
    return None
# STRING rule. The raw string that opens the function body is the regex PLY
# matches against and must stay the first statement.
# The token value becomes the matched text without its first and last
# character (the surrounding double quotes).
def t_STRING(token: lex.LexToken):
    r'"[^"]*"'
    quoted = token.value
    unquoted = quoted[1:-1]  # drop the opening and closing quote
    token.value = unquoted
    return token
# PLY error hook: prints a three-line diagnostic for input no rule matched.
# token.value holds the remaining unmatched input; its first character is the
# one reported as unexpected.
def t_error(token: lex.LexToken):
    report = (
        f"{token.lineno} Unexpected character '{token.value[0]}' at position {token.lexpos}.",
        '... ' + token.value,
        ' ^',
    )
    print(*report, sep='\n')
    # Skipping the offending character stays disabled, so the lexer does not
    # advance here (presumably PLY then raises lex.LexError -- confirm):
    # token.lexer.skip(1)
# Module-level lexer instance shared by lex_line(); created after all of the
# t_* rules above have been defined.
lexer = lex.lex()
def lex_line(line: str, lineno: int = 0) -> List[lex.LexToken]:
    """
    Tokenize one HECKformat line with the shared module-level lexer.

    The lexer's ``lineno`` is set to *lineno* and its input replaced by
    *line*; tokens are then pulled until the lexer returns ``None``.

    Returns the tokens in the order they were produced.

    Raises HeckLexException (chained from the underlying ``lex.LexError``)
    when the lexer reports an error.
    """
    lexer.lineno = lineno
    try:
        lexer.input(line)
        # lexer.token() yields None once the input is exhausted.
        return list(iter(lexer.token, None))
    except lex.LexError as lex_failure:
        # fixme raise a HeckFormat exception
        raise HeckLexException from lex_failure
# Sample input lines fed to lex_line() by the __main__ self-test below.
# NOTE(review): entries containing text such as ``123xyz`` look like they are
# meant to exercise the error path -- confirm.
TEST_STRINGS = [
    '"hi yo123 123xyz #foo" 123xyz 123.223 1f abcd123 123abc $foo "hello world" #foo',
    '1.23f',
    '"hello world!" atom utehuteu tnhoeun_etuhenuoh',
    '"hi yo123 123xyz #foo" xyz 123.223 1f abcd123 abc $foo "hello world" #foo',
    '%%% heck',
    '%%% markdown foo=bar',
    'element 1.2 1.3 1.4 attrib="string value for attribute"',
    '> element 5 4 3 2.5',
]
if __name__ == "__main__":
    # Ad-hoc self-test: lex every sample line and print the resulting tokens.
    for idx, test in enumerate(TEST_STRINGS):
        print(f"Line {idx}: '{test}'")
        try:
            line_tokens = lex_line(test, idx)
        except Exception as inst:
            # Top-level boundary of a demo script: keep going with the next
            # sample, but say what went wrong instead of hiding it.
            # lex_line chains the original lexer error as __cause__.
            reason = inst.__cause__ if inst.__cause__ is not None else inst
            print(f'Error in line {idx}: {reason!r}')
        else:
            for tok in line_tokens:
                print(' ' + str(tok))