Commit utils move and the slow py dumper

This commit is contained in:
2026-03-24 19:33:51 -07:00
parent 3fae360c2e
commit 10195658e6
2 changed files with 103 additions and 0 deletions

60
carkov/analyze/utils.py Normal file
View File

@ -0,0 +1,60 @@
#
# carkov markov chain library
# © Copyright 2025 by Aldercone Studio <alderconestudio@gmail.com>
# This is free software, see the included LICENSE for terms and conditions.
#
# for now we'll use nltk but this is here to make it so we implement our own
import nltk
# Probe for the NLTK "punkt" sentence-tokenizer models; sent_tokenize raises
# LookupError when the model data is not installed locally, in which case we
# fetch it.  NOTE(review): this runs at import time and may hit the network.
try:
    nltk.sent_tokenize("foo bar")
except LookupError:
    nltk.download('punkt')
    nltk.download('punkt_tab')
# this is surely incomplete, but good enough possibly.
# Characters treated as stand-alone punctuation tokens (includes curly quotes).
PUNC="!@#^&*():/\\.,+-?|;'\"<>“”‘’‽=_$"
# Whitespace characters that separate words (space and tab only).
WS=" \t"
# Punctuation that terminates a sentence.
TERM_PUNC="!.?‽"
def english_sentence_split(corpus: str, quotations: str = '"\'', sent_endings: str = TERM_PUNC) -> list[str]:
    """
    Split a corpus into a list of individual sentences.

    Sentence-terminating punctuation and quotation marks delimit sentences and
    are kept in the output; punctuation only acts as a delimiter when there is
    a space before or after it.

    NOTE: ``quotations`` and ``sent_endings`` are currently unused — splitting
    is delegated to nltk.sent_tokenize until a proper parser is written.
    Quotations make a hand-rolled splitter genuinely hard (``'`` is both a
    quote delimiter and an apostrophe; one heuristic would be to treat it as
    an apostrophe only when it has no adjacent spaces).  FIXME
    """
    sentences = nltk.sent_tokenize(corpus)
    return sentences
def english_sentence_tokenize(sentence: str,
                              punctuation: str = "!@#^&*():/\\.,+-?|;'\"<>“”‘’‽=_$",
                              whitespace: str = " \t") -> list[str]:
    """
    Tokenize a sentence with the following rules:

    * words are separated by whitespace or punctuation
    * punctuation characters are emitted as separate single-character tokens
    * whitespace is dropped

    Arguments:
        sentence: the sentence to tokenize
        punctuation: characters treated as stand-alone tokens
                     (defaults to the module's PUNC set)
        whitespace: characters that separate words without producing a token
                    (defaults to the module's WS set)

    Returns:
        list of tokens in original order
    """
    start = 0
    out: list[str] = []
    for idx, ch in enumerate(sentence):  # `ch` rather than shadowing builtin `chr`
        if ch in punctuation:
            if start != idx:
                out.append(sentence[start:idx])
            out.append(ch)
            start = idx + 1
        elif ch in whitespace:  # exclusive with the branch above: emit no token
            if start != idx:
                out.append(sentence[start:idx])
            start = idx + 1
    # Bug fix: the original dropped any trailing word that was not followed by
    # punctuation or whitespace (e.g. "hello world" lost "world").
    if start < len(sentence):
        out.append(sentence[start:])
    return out

43
carkov/pydumperslow.py Normal file
View File

@ -0,0 +1,43 @@
#
# carkov markov chain library
# © Copyright 2026 by Aldercone Studio <alderconestudio@gmail.com>
# This is free software, see the included LICENSE for terms and conditions.
#
"""
Serialize chain as a python structure (slower load time but more efficient compilation).
"""
from io import TextIOBase
from . import version
from .chain import Chain
template = """
# serialized from version {version}
from carkov.chain import Chain
from carkov.abstracts import NUMBER, TERMINAL, Abstract
DATA={data}
def get_chainer():
chain = Chain({order}, "{analyzer}")
chain.data = {{}}
for chain_data in DATA:
chain.data[chain_data[0]] = {{x[0]: x[1] for x in chain_data[1]}}
return chain
"""
def dump_chainer(chain: Chain, outfile: TextIOBase):
    """
    Serialize a chainer to an open IO stream as importable Python source.

    Arguments:
        chain: A Chain object
        outfile: An open IO stream in text mode that will be written to
    """
    # Snapshot the chain as (key, ((token, weight), ...)) entry tuples.
    entries = [(key, tuple(counts.items())) for key, counts in chain.items()]
    # Emit one entry per line by joining explicit per-entry reprs.  The
    # previous approach — repr(...).replace(")),", ")),\n") — could insert a
    # newline inside a string literal whenever a stored token happened to
    # contain ")),", producing a syntactically broken generated file.
    if entries:
        data_src = "(" + ",\n".join(repr(entry) for entry in entries) + ",)"
    else:
        # Empty chain: emit a valid empty tuple literal.
        data_src = "()"
    outfile.write(template.format(version=version,
                                  order=chain.order,
                                  analyzer=chain.analyzer_class,
                                  data=data_src))