heckformat/python/heck/lexer.py

import ply.lex as lex

"""
Lexical analyzer for HECKformat lines using PLY Lex.
"""

from .exceptions import HeckLexException

from typing import List, Optional

import string

# Token type names produced by this lexer; each one has a matching
# ``t_<NAME>`` rule defined further down in this module.
tokens = (
    'ATOM',
    'BASE10',
    'BASE16',
    'COMMENT',
    'STRING',
    'SECTION',
    'ATTRIB',
    'DEEP',
)

# COMMENT            ::= # .*$
# ATOM               ::= [A-Za-z_$][A-Za-z0-9_.-]*
# BASE10NUMBER       ::= (-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?
# BASE16NUMBER       ::= 0x[0-9A-Fa-f]+
# NUMBER             ::= (<BASE10NUMBER|BASE16NUMBER>)
# STRING             ::= "([^\"]*|(\\)|(\"))"
# VALUE              ::= (<ATOM>|<STRING>|<NUMBER>)
# VALUES             ::= <VALUE>(\s+<VALUES>)?
# ATTRIBUTENAME      ::= <ATOM>
# ATTRIBUTE          ::= <ATTRIBUTENAME>=<VALUE>
# ATTRIBUTES         ::= <ATTRIBUTE>(\s+<ATTRIBUTES>)?
# SECTIONLABEL       ::= <ATOM>
# SECTION            ::= %%%\s+<SECTIONLABEL>\s+<ATTRIBUTES>
# ELEMENTLABEL       ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]?
# ELEMENT            ::= <ELEMENTLABEL>\s+(<VALUES>|<ATTRIBUTES>)
# LINE               ::= ^(((>)*<ELEMENT>) | <SECTION> | <COMMENT>) (<COMMENT>|$)


# Simple (string-valued) PLY rules. PLY looks these up by their ``t_`` names,
# so both the names and the pattern text are significant.

# Characters skipped between tokens: every character in string.whitespace.
t_ignore = string.whitespace

# One or more '>' characters anchored at the start of the input
# (the nesting-depth marker that precedes an element, see LINE above).
t_DEEP = r'^(>)+'

# Hexadecimal literal: '0x' followed by at least one hex digit.
t_BASE16 = r'0x[0-9A-Fa-f]+'
# Section marker: '%%%' at the start of the input followed by one whitespace
# character (the whitespace is part of the matched text).
t_SECTION = r'^%%%\s'
# The '=' that joins an attribute name to its value.
t_ATTRIB = '='


def t_ATOM(token: lex.LexToken):
    r'[A-Za-z_$][A-Za-z0-9_.-]*'
    # NOTE: the raw string above is the token's regular expression (PLY reads
    # it from the docstring slot); it must remain the first statement.
    #
    # An atom starts with a letter, '_' or '$' and continues with letters,
    # digits, '_', '.' or '-'.  The four spellings below are converted to
    # Python booleans; any other atom keeps its matched text as its value.
    boolean_spellings = {
        'true': True,
        'True': True,
        'false': False,
        'False': False,
    }
    matched = token.value
    token.value = boolean_spellings.get(matched, matched)
    return token

def t_BASE10(token: lex.LexToken):
    r'(-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?(\b|$)'
    # NOTE: the raw string above is the token's regular expression (PLY reads
    # it from the docstring slot); it must remain the first statement.
    #
    # Decimal literal: optional '-', digits, optional fractional part and an
    # optional single-letter typespec.  The typespec is accepted but ignored:
    # the value becomes a float when the text contains '.', otherwise an int.
    literal = token.value
    has_typespec = literal[-1] in 'FLUIDCfluidc'
    digits = literal[:-1] if has_typespec else literal
    convert = float if '.' in digits else int
    token.value = convert(digits)
    return token

def t_COMMENT(token: lex.LexToken):
    r'\#\s?.*$'
    # NOTE: the raw string above is the token's regular expression (PLY reads
    # it from the docstring slot); it must remain the first statement.
    #
    # A comment runs from '#' to the end of the line.  Nothing is returned,
    # so no COMMENT token is handed back to the caller.
    return None

def t_STRING(token: lex.LexToken):
    r'"(?:[^"\\]|\\.)*"'
    # NOTE: the raw string above is the token's regular expression (PLY reads
    # it from the docstring slot); it must remain the first statement.
    #
    # A string is a double-quoted run of characters.  The grammar notes at the
    # top of this module allow the escapes \" and \\ inside a string, so the
    # pattern treats a backslash plus the following character as one unit
    # instead of ending the string at the first '"' it meets.
    #
    # The token value is the text between the quotes with \" turned into "
    # and \\ turned into \ ; any other backslash sequence is kept verbatim.
    import re  # function-scope import keeps this rule self-contained

    inner = token.value[1:-1]  # drop the surrounding double quotes
    token.value = re.sub(r'\\(["\\])', r'\1', inner)
    return token

def t_error(token: lex.LexToken):
    # Error hook: report the character that no rule could match.
    # For this call, token.value is indexed at [0] for the offending character
    # and printed in full after '... ', with a caret line marking its start.
    remaining = token.value
    headline = f"{token.lineno} Unexpected character '{remaining[0]}' at position {token.lexpos}."
    print(headline)
    print('... ' + remaining)
    print('    ^')
    # The lexer position is intentionally left where it is (no
    # token.lexer.skip(1)); presumably this is what makes PLY raise the
    # LexError that lex_line() converts -- confirm against the PLY docs.

# Build the module-level lexer from the ``tokens`` tuple and ``t_*`` rules
# defined above.  lex_line() reuses this single instance for every line.
lexer = lex.lex()

def lex_line(line: str, lineno: int=0) -> List[lex.LexToken]:
    """
    Tokenize a single HECKformat line with the module-level lexer.

    Args:
        line: Text of the line to tokenize.
        lineno: Line number stored on the lexer before scanning, so tokens
            and error reports carry it.

    Returns:
        The tokens produced for the line, in the order they were scanned.

    Raises:
        HeckLexException: Chained from the ``lex.LexError`` raised while
            scanning the line.
    """
    lexer.lineno = lineno
    try:
        lexer.input(line)
        # lexer.token() is called repeatedly until it returns None, which
        # signals that the input has been exhausted.
        return list(iter(lexer.token, None))
    except lex.LexError as inst:
        raise HeckLexException from inst

# Sample lines fed to lex_line() by the ``__main__`` self-test below.
# They mix quoted strings, atoms, decimal numbers (with and without a
# typespec suffix), trailing comments, '%%%' section headers, attributes
# and a '>'-prefixed element.
TEST_STRINGS = [
    '"hi yo123 123xyz #foo" 123xyz 123.223 1f abcd123 123abc $foo "hello world" #foo',
    '1.23f',
    '"hello world!" atom utehuteu tnhoeun_etuhenuoh',
    '"hi yo123 123xyz #foo" xyz 123.223 1f abcd123 abc $foo "hello world" #foo',
    '%%% heck',
    '%%% markdown foo=bar',
    'element 1.2 1.3 1.4 attrib="string value for attribute"',
    '> element 5 4 3 2.5',
]

if __name__ == "__main__":
    # Self-test: tokenize every sample line and print the resulting tokens.
    for idx, test in enumerate(TEST_STRINGS):
        print(f"Line {idx}: '{test}'")
        try:
            for token in lex_line(test, idx):
                print(' ' + str(token))
        except Exception as inst:
            # Keep going with the next sample, but say what actually failed:
            # prefer the chained cause (the original lexer error) when there
            # is one, otherwise show the exception itself.
            print(f'Error in line: {inst.__cause__ or inst!r}')
Initial checkin of Python parser for heckfiles. 2024-01-31 17:30:20 +01:00			`import ply.lex as lex`

Organize and clean up Python implementation. 2024-01-31 18:13:52 +01:00			`"""`
			`Lexical analyzer for HECKformat lines using PLY Lex.`
			`"""`

			`from .exceptions import HeckLexException`

Initial checkin of Python parser for heckfiles. 2024-01-31 17:30:20 +01:00			`from typing import List, Optional`

			`import string`

			`tokens = ('ATOM', 'BASE10', 'BASE16', 'COMMENT', 'STRING', 'SECTION', 'ATTRIB', 'DEEP')`

			`# COMMENT ::= # .*$`
			`# ATOM ::= [A-Za-z_][A-Za-z0-9_-]?`
			`# BASE10NUMBER ::= (-)?[0-9]+(\.)?[0-9]+([FLUIDCfluidc])?`
			`# BASE16NUMBER ::= 0x[0-9A-Fa-f]+`
			`# NUMBER ::= (<BASE10NUMBER\|BASE16NUMBER>)`
			`# STRING ::= "([^\"]*\|(\\)\|(\"))"`
			`# VALUE ::= (<ATOM>\|<STRING>\|<NUMBER>)`
			`# VALUES ::= <VALUE>(\s+<VALUES>)?`
			`# ATTRIBUTENAME ::= <ATOM>`
			`# ATTRIBUTE ::= <ATTRIBUTENAME>=<VALUE>`
			`# ATTRIBUTES ::= <ATTRIBUTE>(\s+<ATTRIBUTES>)?`
			`# SECTIONLABEL ::= <ATOM>`
			`# SECTION ::= %%%\s+<SECTIONLABEL>\s+<ATTRIBUTES>`
			`# ELEMENTLABEL ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]?`
			`# ELEMENT ::= <ELEMENTLABEL>\s+(<VALUES>\|<ATTRIBUTES>)`
			`# LINE ::= ^(((>)*<ELEMENT>) \| <SECTION> \| <COMMENT>) (<COMMENT>\|$)`


			`t_ignore = string.whitespace`
Organize and clean up Python implementation. 2024-01-31 18:13:52 +01:00
Initial checkin of Python parser for heckfiles. 2024-01-31 17:30:20 +01:00			`t_DEEP = r'^(>)+'`
Implement subelements. Add allowing valueless elements. Start to design the native structure interface. 2024-02-03 18:42:20 +01:00
Initial checkin of Python parser for heckfiles. 2024-01-31 17:30:20 +01:00			`t_BASE16 = r'0x[0-9A-Fa-f]+'`
			`t_SECTION = r'^%%%\s'`
			`t_ATTRIB = '='`

Implement subelements. Add allowing valueless elements. Start to design the native structure interface. 2024-02-03 18:42:20 +01:00
			`def t_ATOM(token: lex.LexToken):`
			`r'[A-Za-z_$][A-Za-z0-9_.-]*'`
			`if token.value in ('true', 'True'):`
			`token.value = True`
			`elif token.value in ('false', 'False'):`
			`token.value = False`
			`return token`

Initial checkin of Python parser for heckfiles. 2024-01-31 17:30:20 +01:00			`def t_BASE10(token: lex.LexToken):`
			`r'(-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?(\b\|$)'`
			`# python numbers are Very Flexible so we ignore typespec`
			`vstr = token.value`
			`if vstr[-1] in 'FLUIDCfluidc':`
			`vstr = vstr[:-1]`
			`if '.' in vstr:`
			`token.value = float(vstr)`
			`else:`
			`token.value = int(vstr)`
			`return token`

			`def t_COMMENT(token: lex.LexToken):`
			`r'\#\s?.*$'`
			`...`

			`def t_STRING(token: lex.LexToken):`
			`r'"[^"]*"'`
			`token.value = token.value[1:-1] # substring to strip double quotes`
			`return token`

			`def t_error(token: lex.LexToken):`
			`print(f"{token.lineno} Unexpected character '{token.value[0]}' at position {token.lexpos}.")`
			`print('... ' + token.value)`
			`print(' ^')`
			`# token.lexer.skip(1)`

			`lexer = lex.lex()`

Organize and clean up Python implementation. 2024-01-31 18:13:52 +01:00			`def lex_line(line: str, lineno: int=0) -> List[lex.LexToken]:`
			`"""`
			`Return a list of tokens for a particular HECKformat file line.`

			`"""`
Initial checkin of Python parser for heckfiles. 2024-01-31 17:30:20 +01:00			`lexer.lineno = lineno`
			`try:`
			`lexer.input(line)`
			`tokens = []`
			`while True:`
			`tok = lexer.token()`
			`if tok:`
			`tokens.append(tok)`
			`else:`
			`break`
			`return tokens`
			`except lex.LexError as inst:`
			`# fixme raise a HeckFormat exception`
Organize and clean up Python implementation. 2024-01-31 18:13:52 +01:00			`raise HeckLexException from inst`
Initial checkin of Python parser for heckfiles. 2024-01-31 17:30:20 +01:00
			`TEST_STRINGS = [`
			`'"hi yo123 123xyz #foo" 123xyz 123.223 1f abcd123 123abc $foo "hello world" #foo',`
			`'1.23f',`
			`'"hello world!" atom utehuteu tnhoeun_etuhenuoh',`
			`'"hi yo123 123xyz #foo" xyz 123.223 1f abcd123 abc $foo "hello world" #foo',`
			`'%%% heck',`
			`'%%% markdown foo=bar',`
			`'element 1.2 1.3 1.4 attrib="string value for attribute"',`
			`'> element 5 4 3 2.5',`
			`]`

			`if __name__ == "__main__":`
			`for idx, test in enumerate(TEST_STRINGS):`
			`print(f"Line {idx}: '{test}'")`
			`try:`
			`for token in lex_line(test, idx):`
			`print(' ' + str(token))`
			`except Exception as inst:`
			`print(f'Error in line.')`