"""Lexical analyzer for HECKformat lines using PLY Lex."""
import string
from typing import List, Optional

import ply.lex as lex

from .exceptions import HeckLexException

# Token names exported to PLY; every entry needs a matching ``t_<NAME>`` rule.
tokens = ('ATOM', 'BASE10', 'BASE16', 'COMMENT', 'STRING', 'SECTION', 'ATTRIB', 'DEEP')

# Informal grammar sketch for a HECKformat line.
# NOTE(review): several right-hand sides below are empty or truncated (e.g.
# ``NUMBER ::= ()``); the nonterminal references appear to have been lost
# from this comment. The ``t_*`` rules further down are the patterns that are
# actually applied -- confirm the intended grammar before relying on this.
#
# COMMENT ::= # .*$
# ATOM ::= [A-Za-z_][A-Za-z0-9_-]?
# BASE10NUMBER ::= (-)?[0-9]+(\.)?[0-9]+([FLUIDCfluidc])?
# BASE16NUMBER ::= 0x[0-9A-Fa-f]+
# NUMBER ::= ()
# STRING ::= "([^\"]*|(\\)|(\"))"
# VALUE ::= (||)
# VALUES ::= (\s+)?
# ATTRIBUTENAME ::=
# ATTRIBUTE ::= =
# ATTRIBUTES ::= (\s+)?
# SECTIONLABEL ::=
# SECTION ::= %%%\s+\s+
# ELEMENTLABEL ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]?
# ELEMENT ::= \s+(|)
# LINE ::= ^(((>)*) | | ) (|$)

# Characters skipped between tokens.
t_ignore = string.whitespace

# Simple (string-defined) token rules.
t_DEEP = r'^(>)+'
t_ATOM = r'[A-Za-z_$][A-Za-z0-9_.-]*'
t_BASE16 = r'0x[0-9A-Fa-f]+'
t_SECTION = r'^%%%\s'
t_ATTRIB = '='


# NOTE: for PLY rule functions the docstring IS the token's regular
# expression, so it must stay the first statement and must not be reworded.
def t_BASE10(token: lex.LexToken):
    r'(-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?(\b|$)'
    # python numbers are Very Flexible so we ignore typespec
    vstr = token.value
    if vstr[-1] in 'FLUIDCfluidc':
        vstr = vstr[:-1]
    # A decimal point selects float; everything else becomes int.
    token.value = float(vstr) if '.' in vstr else int(vstr)
    return token


def t_COMMENT(token: lex.LexToken):
    r'\#\s?.*$'
    # No token is returned, so comment text never reaches the token stream.
    return None


def t_STRING(token: lex.LexToken):
    r'"[^"]*"'
    token.value = token.value[1:-1]  # substring to strip double quotes
    return token


def t_error(token: lex.LexToken):
    # Report the offending input; token.value holds the remaining unlexed text.
    print(f"{token.lineno} Unexpected character '{token.value[0]}' at position {token.lexpos}.")
    print('... ' + token.value)
    print(' ^')
    # Deliberately not skipping the bad character: leaving the position
    # unchanged makes PLY raise LexError, which lex_line() turns into a
    # HeckLexException.
    # token.lexer.skip(1)


# Built once at import time from the rules defined above.
lexer = lex.lex()


def lex_line(line: str, lineno: int = 0) -> List[lex.LexToken]:
    """
    Return a list of tokens for a particular HECKformat file line.

    :param line: text of a single line to tokenize.
    :param lineno: line number assigned to the lexer before scanning, so
        tokens and error output carry it.
    :return: the tokens produced for ``line``, in order.
    :raises HeckLexException: when PLY raises ``LexError`` while scanning;
        the original error is attached as ``__cause__``.
    """
    lexer.lineno = lineno
    lexer.input(line)
    try:
        # lexer.token() returns None once the input is exhausted.
        return list(iter(lexer.token, None))
    except lex.LexError as inst:
        # fixme raise a HeckFormat exception
        raise HeckLexException from inst


TEST_STRINGS = [
    '"hi yo123 123xyz #foo" 123xyz 123.223 1f abcd123 123abc $foo "hello world" #foo',
    '1.23f',
    '"hello world!" atom utehuteu tnhoeun_etuhenuoh',
    '"hi yo123 123xyz #foo" xyz 123.223 1f abcd123 abc $foo "hello world" #foo',
    '%%% heck',
    '%%% markdown foo=bar',
    'element 1.2 1.3 1.4 attrib="string value for attribute"',
    '> element 5 4 3 2.5',
]


if __name__ == "__main__":
    # Manual smoke test: tokenize each sample line and dump the tokens.
    for idx, test in enumerate(TEST_STRINGS):
        print(f"Line {idx}: '{test}'")
        try:
            for token in lex_line(test, idx):
                print(' ' + str(token))
        except Exception as inst:
            # Show what actually went wrong (prefer the underlying LexError)
            # instead of discarding the exception.
            print(f'Error in line: {inst.__cause__ or inst!r}')