# python/heckformat/lexer.py — new file (119 lines) from commit
# "Reorganize project layout. Add buildability."
"""
Lexical analyzer for HECKformat lines using PLY Lex.
"""

# NOTE: the module docstring must precede all code (including imports) to be
# recognized by Python; it previously sat after `import ply.lex` and was a
# discarded string expression rather than the module docstring.

import string
from typing import List, Optional

import ply.lex as lex

from .exceptions import HeckLexException

# Token names produced by this lexer. PLY requires this module-level
# `tokens` tuple to validate the t_* rules below.
tokens = ('ATOM', 'BASE10', 'BASE16', 'COMMENT', 'STRING', 'SECTION', 'ATTRIB', 'DEEP', 'ELEMENT')
# Informal grammar for a HECKformat line:
# COMMENT ::= # .*$
# ATOM ::= [A-Za-z_][A-Za-z0-9_-]?
# BASE10NUMBER ::= (-)?[0-9]+(\.)?[0-9]+([FLUIDCfluidc])?
# BASE16NUMBER ::= 0x[0-9A-Fa-f]+
# NUMBER ::= (<BASE10NUMBER|BASE16NUMBER>)
# STRING ::= "([^\"]*|(\\)|(\"))"
# VALUE ::= (<ATOM>|<STRING>|<NUMBER>)
# VALUES ::= <VALUE>(\s+<VALUES>)?
# ATTRIBUTENAME ::= <ATOM>
# ATTRIBUTE ::= <ATTRIBUTENAME>=<VALUE>
# ATTRIBUTES ::= <ATTRIBUTE>(\s+<ATTRIBUTES>)?
# SECTIONLABEL ::= <ATOM>
# SECTION ::= %%%\s+<SECTIONLABEL>\s+<ATTRIBUTES>
# ELEMENTLABEL ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]?
# ELEMENT ::= <ELEMENTLABEL>\s+(<VALUES>|<ATTRIBUTES>)
# LINE ::= ^(((>)*<ELEMENT>) | <SECTION> | <COMMENT>) (<COMMENT>|$)

# Simple string-valued token rules. NOTE(review): PLY orders string rules by
# decreasing regex length, so their order here is cosmetic.

# Skip all whitespace between tokens. Includes '\n'/'\r' — harmless here since
# input is presumably a single line (see lex_line); TODO confirm.
t_ignore = string.whitespace

# One or more leading '>' markers (nesting depth of an ELEMENT).
t_DEEP = r'^(>)+'

# Hexadecimal integer literal, e.g. 0x1F. Left as text (not converted to int).
t_BASE16 = r'0x[0-9A-Fa-f]+'

# Section header introducer: '%%%' at the start of the line.
t_SECTION = r'^%%%\s'

# Attribute assignment separator, as in name=value.
t_ATTRIB = '='

# Element label. Broader than ATOM: also allows '.' first and punctuation
# (! @ $ % ^ & * ( ) + / \ -) in the tail.
t_ELEMENT = r'[A-Za-z_.][A-Za-z0-9.!@\$%^&*()_+/\\-]*'
def t_ATOM(token: lex.LexToken):
    r'[A-Za-z_$][A-Za-z0-9_.-]*'
    # (The raw-string docstring above is the token's regex — PLY reads it.)
    # Fold boolean-looking atoms into real Python bools; all other atoms
    # keep their string value unchanged.
    _BOOL_ATOMS = {'true': True, 'True': True, 'false': False, 'False': False}
    if token.value in _BOOL_ATOMS:
        token.value = _BOOL_ATOMS[token.value]
    return token
def t_BASE10(token: lex.LexToken):
    r'(-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?(\b|$)'
    # (The raw-string docstring above is the token's regex — PLY reads it.)
    # Python numbers are flexible enough that the optional single-character
    # type specifier (F/L/U/I/D/C, either case) can simply be dropped.
    text = token.value
    if text.endswith(tuple('FLUIDCfluidc')):
        text = text[:-1]
    # A decimal point means float; otherwise parse as int.
    token.value = float(text) if '.' in text else int(text)
    return token
def t_COMMENT(token: lex.LexToken):
    r'\#\s?.*$'
    # (The raw-string docstring above is the token's regex — PLY reads it.)
    # Returning nothing (None) tells PLY to discard the token, so comments
    # never appear in the output stream.
    pass
def t_STRING(token: lex.LexToken):
    r'"[^"]*"'
    # (The raw-string docstring above is the token's regex — PLY reads it.)
    # The regex forbids interior double quotes, so stripping '"' from both
    # ends removes exactly the two surrounding delimiters.
    token.value = token.value.strip('"')
    return token
def t_error(token: lex.LexToken):
    """Report an unrecognized character.

    The character is deliberately NOT skipped, so PLY raises LexError
    after this returns; lex_line converts that into a HeckLexException.
    """
    header = f"{token.lineno} Unexpected character '{token.value[0]}' at position {token.lexpos}."
    print(header)
    print('... ' + token.value)
    print(' ^')
    # token.lexer.skip(1)
# Build the module-level lexer from the t_* rules above.
lexer = lex.lex()


def lex_line(line: str, lineno: int = 0) -> List[lex.LexToken]:
    """Return a list of tokens for a particular HECKformat file line.

    Args:
        line: the text of one line from a HECKformat file.
        lineno: line number to record on the lexer (default 0).

    Returns:
        All tokens found on the line (COMMENT tokens are discarded by
        their rule and never appear).

    Raises:
        HeckLexException: if the line contains an unlexable character.
    """
    lexer.lineno = lineno
    try:
        lexer.input(line)
        # PLY lexers implement the iterator protocol, yielding tokens
        # until the input is exhausted — no manual while/token() loop
        # needed. (Also dropped a stale "fixme raise" comment: the
        # HeckLexException below already does exactly that.)
        return list(lexer)
    except lex.LexError as inst:
        raise HeckLexException from inst
# Sample input lines exercised by the __main__ smoke test below: strings,
# numbers with type suffixes, atoms, section headers, and nested elements.
TEST_STRINGS = [
    '"hi yo123 123xyz #foo" 123xyz 123.223 1f abcd123 123abc $foo "hello world" #foo',
    '1.23f',
    '"hello world!" atom utehuteu tnhoeun_etuhenuoh',
    '"hi yo123 123xyz #foo" xyz 123.223 1f abcd123 abc $foo "hello world" #foo',
    '%%% heck',
    '%%% markdown foo=bar',
    'element 1.2 1.3 1.4 attrib="string value for attribute"',
    '> element 5 4 3 2.5',
]
if __name__ == "__main__":
    # Smoke test: lex every sample line and dump its tokens.
    for idx, test in enumerate(TEST_STRINGS):
        print(f"Line {idx}: '{test}'")
        try:
            for token in lex_line(test, idx):
                print(' ' + str(token))
        except Exception as inst:
            # Previously printed a placeholder-free f-string ('Error in
            # line.') and discarded the exception; include its detail.
            print(f'Error in line: {inst}')
Reference in New Issue
Block a user