# python/heckformat/lexer.py — new file (119 lines) from commit
# "Reorganize project layout. Add buildability."
"""
Lexical analyzer for HECKformat lines using PLY Lex.
"""

# NOTE: the module docstring must precede all code (including imports) to be
# recognized by Python; it previously sat after `import ply.lex` and was a
# discarded string expression rather than the module docstring.

import string
from typing import List, Optional

import ply.lex as lex

from .exceptions import HeckLexException

# Token names produced by this lexer. PLY requires this module-level
# `tokens` tuple to validate the t_* rules below.
tokens = ('ATOM', 'BASE10', 'BASE16', 'COMMENT', 'STRING', 'SECTION', 'ATTRIB', 'DEEP', 'ELEMENT')
# Informal grammar for a HECKformat line:
# COMMENT ::= # .*$
# ATOM ::= [A-Za-z_][A-Za-z0-9_-]?
# BASE10NUMBER ::= (-)?[0-9]+(\.)?[0-9]+([FLUIDCfluidc])?
# BASE16NUMBER ::= 0x[0-9A-Fa-f]+
# NUMBER ::= (<BASE10NUMBER|BASE16NUMBER>)
# STRING ::= "([^\"]*|(\\)|(\"))"
# VALUE ::= (<ATOM>|<STRING>|<NUMBER>)
# VALUES ::= <VALUE>(\s+<VALUES>)?
# ATTRIBUTENAME ::= <ATOM>
# ATTRIBUTE ::= <ATTRIBUTENAME>=<VALUE>
# ATTRIBUTES ::= <ATTRIBUTE>(\s+<ATTRIBUTES>)?
# SECTIONLABEL ::= <ATOM>
# SECTION ::= %%%\s+<SECTIONLABEL>\s+<ATTRIBUTES>
# ELEMENTLABEL ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]?
# ELEMENT ::= <ELEMENTLABEL>\s+(<VALUES>|<ATTRIBUTES>)
# LINE ::= ^(((>)*<ELEMENT>) | <SECTION> | <COMMENT>) (<COMMENT>|$)

# Simple string-valued token rules. NOTE(review): PLY orders string rules by
# decreasing regex length, so their order here is cosmetic.

# Skip all whitespace between tokens. Includes '\n'/'\r' — harmless here since
# input is presumably a single line (see lex_line); TODO confirm.
t_ignore = string.whitespace

# One or more leading '>' markers (nesting depth of an ELEMENT).
t_DEEP = r'^(>)+'

# Hexadecimal integer literal, e.g. 0x1F. Left as text (not converted to int).
t_BASE16 = r'0x[0-9A-Fa-f]+'

# Section header introducer: '%%%' at the start of the line.
t_SECTION = r'^%%%\s'

# Attribute assignment separator, as in name=value.
t_ATTRIB = '='

# Element label. Broader than ATOM: also allows '.' first and punctuation
# (! @ $ % ^ & * ( ) + / \ -) in the tail.
t_ELEMENT = r'[A-Za-z_.][A-Za-z0-9.!@\$%^&*()_+/\\-]*'
def t_ATOM(token: lex.LexToken):
    r'[A-Za-z_$][A-Za-z0-9_.-]*'
    # (The raw-string docstring above is the token's regex — PLY reads it.)
    # Fold boolean-looking atoms into real Python bools; all other atoms
    # keep their string value unchanged.
    _BOOL_ATOMS = {'true': True, 'True': True, 'false': False, 'False': False}
    if token.value in _BOOL_ATOMS:
        token.value = _BOOL_ATOMS[token.value]
    return token
def t_BASE10(token: lex.LexToken):
    r'(-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?(\b|$)'
    # (The raw-string docstring above is the token's regex — PLY reads it.)
    # Python numbers are flexible enough that the optional single-character
    # type specifier (F/L/U/I/D/C, either case) can simply be dropped.
    text = token.value
    if text.endswith(tuple('FLUIDCfluidc')):
        text = text[:-1]
    # A decimal point means float; otherwise parse as int.
    token.value = float(text) if '.' in text else int(text)
    return token
def t_COMMENT(token: lex.LexToken):
    r'\#\s?.*$'
    # (The raw-string docstring above is the token's regex — PLY reads it.)
    # Returning nothing (None) tells PLY to discard the token, so comments
    # never appear in the output stream.
    pass
def t_STRING(token: lex.LexToken):
    r'"[^"]*"'
    # (The raw-string docstring above is the token's regex — PLY reads it.)
    # The regex forbids interior double quotes, so stripping '"' from both
    # ends removes exactly the two surrounding delimiters.
    token.value = token.value.strip('"')
    return token
def t_error(token: lex.LexToken):
    """Report an unrecognized character.

    The character is deliberately NOT skipped, so PLY raises LexError
    after this returns; lex_line converts that into a HeckLexException.
    """
    header = f"{token.lineno} Unexpected character '{token.value[0]}' at position {token.lexpos}."
    print(header)
    print('... ' + token.value)
    print(' ^')
    # token.lexer.skip(1)
# Build the module-level lexer from the t_* rules above.
lexer = lex.lex()


def lex_line(line: str, lineno: int = 0) -> List[lex.LexToken]:
    """Return a list of tokens for a particular HECKformat file line.

    Args:
        line: the text of one line from a HECKformat file.
        lineno: line number to record on the lexer (default 0).

    Returns:
        All tokens found on the line (COMMENT tokens are discarded by
        their rule and never appear).

    Raises:
        HeckLexException: if the line contains an unlexable character.
    """
    lexer.lineno = lineno
    try:
        lexer.input(line)
        # PLY lexers implement the iterator protocol, yielding tokens
        # until the input is exhausted — no manual while/token() loop
        # needed. (Also dropped a stale "fixme raise" comment: the
        # HeckLexException below already does exactly that.)
        return list(lexer)
    except lex.LexError as inst:
        raise HeckLexException from inst
# Sample input lines exercised by the __main__ smoke test below: strings,
# numbers with type suffixes, atoms, section headers, and nested elements.
TEST_STRINGS = [
    '"hi yo123 123xyz #foo" 123xyz 123.223 1f abcd123 123abc $foo "hello world" #foo',
    '1.23f',
    '"hello world!" atom utehuteu tnhoeun_etuhenuoh',
    '"hi yo123 123xyz #foo" xyz 123.223 1f abcd123 abc $foo "hello world" #foo',
    '%%% heck',
    '%%% markdown foo=bar',
    'element 1.2 1.3 1.4 attrib="string value for attribute"',
    '> element 5 4 3 2.5',
]
if __name__ == "__main__":
    # Smoke test: lex every sample line and dump its tokens.
    for idx, test in enumerate(TEST_STRINGS):
        print(f"Line {idx}: '{test}'")
        try:
            for token in lex_line(test, idx):
                print(' ' + str(token))
        except Exception as inst:
            # Previously printed a placeholder-free f-string ('Error in
            # line.') and discarded the exception; include its detail.
            print(f'Error in line: {inst}')
Reference in New Issue
Block a user