# heckformat/python/heck/lexer.py

import ply.lex as lex

"""
Lexical analyzer for HECKformat lines using PLY Lex.
"""

from .exceptions import HeckLexException

from typing import List, Optional

import string

# Names of every token type this lexer declares; PLY reads this module-level
# ``tokens`` sequence when the lexer is built.
tokens = (
    'ATOM',
    'BASE10',
    'BASE16',
    'COMMENT',
    'STRING',
    'SECTION',
    'ATTRIB',
    'DEEP',
    'ELEMENT',
)

# COMMENT            ::= # .*$
# ATOM               ::= [A-Za-z_][A-Za-z0-9_-]*
# BASE10NUMBER       ::= (-)?[0-9]+(\.[0-9]+)?([FLUIDCfluidc])?
# BASE16NUMBER       ::= 0x[0-9A-Fa-f]+
# NUMBER             ::= (<BASE10NUMBER|BASE16NUMBER>)
# STRING             ::= "([^\"]*|(\\)|(\"))"
# VALUE              ::= (<ATOM>|<STRING>|<NUMBER>)
# VALUES             ::= <VALUE>(\s+<VALUES>)?
# ATTRIBUTENAME      ::= <ATOM>
# ATTRIBUTE          ::= <ATTRIBUTENAME>=<VALUE>
# ATTRIBUTES         ::= <ATTRIBUTE>(\s+<ATTRIBUTES>)?
# SECTIONLABEL       ::= <ATOM>
# SECTION            ::= %%%\s+<SECTIONLABEL>\s+<ATTRIBUTES>
# ELEMENTLABEL       ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]*
# ELEMENT            ::= <ELEMENTLABEL>\s+(<VALUES>|<ATTRIBUTES>)
# LINE               ::= ^(((>)*<ELEMENT>) | <SECTION> | <COMMENT>) (<COMMENT>|$)


# Characters the lexer skips between tokens (every ASCII whitespace
# character, newlines included).
t_ignore = string.whitespace

# --- Tokens defined by a plain regular-expression string ---------------------
# NOTE(review): PLY documents that function rules (t_ATOM, t_BASE10, ...) are
# tried before string rules, so t_ATOM appears to win over t_ELEMENT for any
# label that starts with a letter, '_' -- confirm this is intended.

# '=' separating an attribute name from its value.
t_ATTRIB = '='

# Hexadecimal literal with a mandatory '0x' prefix.
t_BASE16 = r'0x[0-9A-Fa-f]+'

# One or more '>' markers; the '^' anchor restricts it to the start of input.
t_DEEP = r'^(>)+'

# '%%%' section marker followed by one whitespace character, start of input only.
t_SECTION = r'^%%%\s'

# Element label: a letter, '_' or '.', then letters, digits and punctuation.
t_ELEMENT = r'[A-Za-z_.][A-Za-z0-9.!@\$%^&*()_+/\\-]*'

# Bare-word token. The docstring below is the regex PLY matches against, so it
# must remain the first statement of the function and must not be reworded.
# The four boolean spellings are converted to Python bools; every other atom
# keeps its text as the token value.
def t_ATOM(token: lex.LexToken):
    r'[A-Za-z_$][A-Za-z0-9_.-]*'
    boolean_spellings = {
        'true': True,
        'True': True,
        'false': False,
        'False': False,
    }
    token.value = boolean_spellings.get(token.value, token.value)
    return token


# Decimal number token. The docstring is the regex PLY matches against and
# must stay the first statement, unchanged. The token value becomes a Python
# float when the text contains a '.', otherwise an int.
def t_BASE10(token: lex.LexToken):
    r'(-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?(\b|$)'
    digits = token.value
    # A single trailing type-spec letter is dropped: Python numbers are
    # flexible enough that the typespec is ignored.
    if digits[-1] in 'FLUIDCfluidc':
        digits = digits[:-1]
    convert = float if '.' in digits else int
    token.value = convert(digits)
    return token

# Comment rule: '#' through the end of the line. The docstring is the regex
# PLY matches against and must stay the first statement, unchanged.
def t_COMMENT(token: lex.LexToken):
    r'\#\s?.*$'
    # Nothing is returned, so no COMMENT token is produced for the matched text.
    return None

# Double-quoted string token. The docstring is the regex PLY matches against
# and must stay the first statement, unchanged. The pattern stops at the first
# '"' after the opening quote, so backslash escapes are not interpreted here.
def t_STRING(token: lex.LexToken):
    r'"[^"]*"'
    quoted = token.value
    # Keep only the text between the opening and closing double quotes.
    token.value = quoted[1:len(quoted) - 1]
    return token

# PLY error hook, called with a token whose ``value`` is the unlexed remainder
# of the input. Prints a three-line report: a message naming the offending
# character, the remaining text, and a caret under its first character.
def t_error(token: lex.LexToken):
    report = (
        f"{token.lineno} Unexpected character '{token.value[0]}' at position {token.lexpos}.",
        '... ' + token.value,
        '    ^',
    )
    for line in report:
        print(line)
    # The lexer position is deliberately not advanced (no token.lexer.skip(1)).
    # Per the PLY documentation the lexer then raises LexError, which lex_line
    # turns into a HeckLexException.

# Module-level lexer instance built by PLY; lex_line() reuses this single
# object for every line it tokenizes.
lexer = lex.lex()

def lex_line(line: str, lineno: int=0) -> List[lex.LexToken]:
    """
    Tokenize a single HECKformat line with the shared module-level lexer.

    Args:
        line: Text of the line to tokenize.
        lineno: Line number stored on the lexer before scanning starts.

    Returns:
        Every token the lexer produced for ``line``, in input order.

    Raises:
        HeckLexException: If PLY raises ``LexError`` while scanning; the
            original error is chained as the cause.
    """
    lexer.lineno = lineno
    try:
        lexer.input(line)
        collected = []
        # Priming-read loop: lexer.token() returns a falsy value once the
        # input is exhausted.
        tok = lexer.token()
        while tok:
            collected.append(tok)
            tok = lexer.token()
        return collected
    except lex.LexError as inst:
        # FIXME (carried over from the original): raise a HeckFormat exception
        raise HeckLexException from inst

# Sample input lines for the self-test at the bottom of this module: each one
# is passed to lex_line() when the file is run as a script.
TEST_STRINGS = [
    '"hi yo123 123xyz #foo" 123xyz 123.223 1f abcd123 123abc $foo "hello world" #foo',
    '1.23f',
    '"hello world!" atom utehuteu tnhoeun_etuhenuoh',
    '"hi yo123 123xyz #foo" xyz 123.223 1f abcd123 abc $foo "hello world" #foo',
    '%%% heck',
    '%%% markdown foo=bar',
    'element 1.2 1.3 1.4 attrib="string value for attribute"',
    '> element 5 4 3 2.5',
]

if __name__ == "__main__":
    # Self-test: lex every sample line and print its tokens, one per row.
    for idx, test in enumerate(TEST_STRINGS):
        print(f"Line {idx}: '{test}'")
        try:
            for token in lex_line(test, idx):
                print(' ' + str(token))
        except Exception:
            # Script-level boundary: report the failure and carry on with the
            # next sample line. t_error has already printed the details of a
            # lexing failure by the time control reaches here.
            print('Error in line.')