98 lines
		
	
	
		
			2.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			98 lines
		
	
	
		
			2.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import re
import string
from typing import List, Optional

import ply.lex as lex
 | |
| 
 | |
| tokens = ('ATOM', 'BASE10', 'BASE16', 'COMMENT', 'STRING', 'SECTION', 'ATTRIB', 'DEEP')
 | |
| 
 | |
| # COMMENT            ::= # .*$
 | |
| # ATOM               ::= [A-Za-z_][A-Za-z0-9_-]?
 | |
| # BASE10NUMBER       ::= (-)?[0-9]+(\.)?[0-9]+([FLUIDCfluidc])?
 | |
| # BASE16NUMBER       ::= 0x[0-9A-Fa-f]+
 | |
| # NUMBER             ::= (<BASE10NUMBER|BASE16NUMBER>)
 | |
| # STRING             ::= "([^\"]*|(\\)|(\"))"
 | |
| # VALUE              ::= (<ATOM>|<STRING>|<NUMBER>)
 | |
| # VALUES             ::= <VALUE>(\s+<VALUES>)?
 | |
| # ATTRIBUTENAME      ::= <ATOM>
 | |
| # ATTRIBUTE          ::= <ATTRIBUTENAME>=<VALUE>
 | |
| # ATTRIBUTES         ::= <ATTRIBUTE>(\s+<ATTRIBUTES>)?
 | |
| # SECTIONLABEL       ::= <ATOM>
 | |
| # SECTION            ::= %%%\s+<SECTIONLABEL>\s+<ATTRIBUTES>
 | |
| # ELEMENTLABEL       ::= [A-Za-z_][A-Za-z0-9!@#$%^&*()_+/\\-]?
 | |
| # ELEMENT            ::= <ELEMENTLABEL>\s+(<VALUES>|<ATTRIBUTES>)
 | |
| # LINE               ::= ^(((>)*<ELEMENT>) | <SECTION> | <COMMENT>) (<COMMENT>|$)
 | |
| 
 | |
| 
 | |
| t_ignore = string.whitespace
 | |
| t_DEEP = r'^(>)+'
 | |
| t_ATOM = r'[A-Za-z_$][A-Za-z0-9_.-]*'
 | |
| t_BASE16 = r'0x[0-9A-Fa-f]+'
 | |
| t_SECTION = r'^%%%\s'
 | |
| t_ATTRIB = '='
 | |
| 
 | |
| def t_BASE10(token: lex.LexToken):
 | |
|     r'(-)?[0-9]+(\.?[0-9]+)?([FLUIDCfluidc])?(\b|$)'
 | |
|     # python numbers are Very Flexible so we ignore typespec
 | |
|     vstr = token.value
 | |
|     if vstr[-1] in 'FLUIDCfluidc':
 | |
|         vstr = vstr[:-1]
 | |
|     if '.' in vstr:
 | |
|         token.value = float(vstr)
 | |
|     else:
 | |
|         token.value = int(vstr)
 | |
|     return token
 | |
| 
 | |
| def t_COMMENT(token: lex.LexToken):
 | |
|     r'\#\s?.*$'
 | |
|     ...
 | |
| 
 | |
| def t_STRING(token: lex.LexToken):
 | |
|   r'"[^"]*"'
 | |
|   token.value = token.value[1:-1] # substring to strip double quotes
 | |
|   return token
 | |
| 
 | |
| def t_error(token: lex.LexToken):
 | |
|     print(f"{token.lineno} Unexpected character '{token.value[0]}' at position {token.lexpos}.")
 | |
|     print('... ' + token.value)
 | |
|     print('    ^')
 | |
|     # token.lexer.skip(1)
 | |
| 
 | |
| lexer = lex.lex()
 | |
| 
 | |
| def lex_line(line, lineno=0) -> Optional[List[lex.LexToken]]:
 | |
|     lexer.lineno = lineno
 | |
|     try:
 | |
|         lexer.input(line)
 | |
|         tokens = []
 | |
|         while True:
 | |
|             tok = lexer.token()
 | |
|             if tok:
 | |
|                 tokens.append(tok)
 | |
|             else:
 | |
|                 break
 | |
|         return tokens
 | |
|     except lex.LexError as inst:
 | |
|         # fixme raise a HeckFormat exception
 | |
|         raise inst
 | |
| 
 | |
| TEST_STRINGS = [
 | |
|     '"hi yo123 123xyz #foo" 123xyz 123.223 1f abcd123 123abc $foo "hello world" #foo',
 | |
|     '1.23f',
 | |
|     '"hello world!" atom utehuteu tnhoeun_etuhenuoh',
 | |
|     '"hi yo123 123xyz #foo" xyz 123.223 1f abcd123 abc $foo "hello world" #foo',
 | |
|     '%%% heck',
 | |
|     '%%% markdown foo=bar',
 | |
|     'element 1.2 1.3 1.4 attrib="string value for attribute"',
 | |
|     '> element 5 4 3 2.5',
 | |
| ]
 | |
| 
 | |
| if __name__ == "__main__":
 | |
|     for idx, test in enumerate(TEST_STRINGS):
 | |
|         print(f"Line {idx}: '{test}'")
 | |
|         try:
 | |
|             for token in lex_line(test, idx):
 | |
|                 print(' ' + str(token))
 | |
|         except Exception as inst:
 | |
|             print(f'Error in line.')
 |