From 10195658e6c1a1f4d9d32d597c27acb66bb9112f Mon Sep 17 00:00:00 2001
From: Cassowary
Date: Tue, 24 Mar 2026 19:33:51 -0700
Subject: [PATCH] Commit utils move and the slow py dumper

Review fixes folded in: english_sentence_tokenize previously dropped the
final token of any sentence that did not end in punctuation/whitespace;
it now flushes the trailing token after the loop. Comment/docstring
typos corrected. Diffstat and hunk headers updated to match.
---
 carkov/analyze/utils.py | 64 +++++++++++++++++++++++++++++++++++++++++
 carkov/pydumperslow.py  | 43 +++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+)
 create mode 100644 carkov/analyze/utils.py
 create mode 100644 carkov/pydumperslow.py

diff --git a/carkov/analyze/utils.py b/carkov/analyze/utils.py
new file mode 100644
index 0000000..9298f96
--- /dev/null
+++ b/carkov/analyze/utils.py
@@ -0,0 +1,64 @@
+#
+# carkov markov chain library
+# © Copyright 2025 by Aldercone Studio
+# This is free software, see the included LICENSE for terms and conditions.
+#
+
+# for now we'll use nltk but this is here to make it so we implement our own
+import nltk
+
+try:
+    nltk.sent_tokenize("foo bar")
+except LookupError:
+    nltk.download('punkt')
+    nltk.download('punkt_tab')
+
+
+# this is surely incomplete, but good enough possibly.
+PUNC = "!@#^&*():/\\.,+-?|;'\"<>“”‘’‽=_$"
+WS = " \t"
+TERM_PUNC = "!.?‽"
+
+def english_sentence_split(corpus: str, quotations: str = '"\'', sent_endings: str = TERM_PUNC) -> list[str]:
+    """
+    Segment a corpus into a list of separate sentences.
+
+    Sentence-terminating punctuation and quotations are used to separate each sentence, but included in the output.
+
+    Punctuation only applies if there is a space before or after it.
+
+    """
+    # this is harder to do than you'd think just splitting on the sent_endings - quotations make things really complicated
+    # especially since ' is both a quotation container and an apostrophe; i guess we could only count it as an apostrophe
+    # if there's no spaces around it, otherwise treat it as a quotation. We'll need to write a proper parser for it.
+
+    # FIXME: quotations/sent_endings are currently unused; delegate to nltk until the custom parser exists
+    return nltk.sent_tokenize(corpus)
+
+
+def english_sentence_tokenize(sentence: str) -> list[str]:
+    """
+    Tokenize a sentence with the following rules:
+
+    * each word may be separated by a space or a punctuation
+    * punctuation is included as separate tokens
+
+    """
+    start = 0
+    out = []
+    for idx, ch in enumerate(sentence):
+        if ch in PUNC:
+            if start != idx:
+                out.append(sentence[start:idx])
+            out.append(ch)
+            start = idx + 1
+        elif ch in WS:
+            if start != idx:
+                out.append(sentence[start:idx])
+            start = idx + 1
+
+    # flush the trailing token: the loop only emits on separators, so a
+    # sentence that doesn't end in punctuation/whitespace would be lost.
+    if start < len(sentence):
+        out.append(sentence[start:])
+    return out
diff --git a/carkov/pydumperslow.py b/carkov/pydumperslow.py
new file mode 100644
index 0000000..08cc844
--- /dev/null
+++ b/carkov/pydumperslow.py
@@ -0,0 +1,43 @@
+#
+# carkov markov chain library
+# © Copyright 2026 by Aldercone Studio
+# This is free software, see the included LICENSE for terms and conditions.
+#
+
+"""
+Serialize chain as a python structure (slower load time but more efficient compilation).
+"""
+
+from io import TextIOBase
+from . import version
+from .chain import Chain
+
+template = """
+# serialized from version {version}
+from carkov.chain import Chain
+from carkov.abstracts import NUMBER, TERMINAL, Abstract
+
+DATA={data}
+
+def get_chainer():
+    chain = Chain({order}, "{analyzer}")
+    chain.data = {{}}
+    for chain_data in DATA:
+        chain.data[chain_data[0]] = {{x[0]: x[1] for x in chain_data[1]}}
+    return chain
+"""
+
+
+def dump_chainer(chain: Chain, outfile: TextIOBase):
+    """
+    Serialize a chainer to an open IO stream
+
+    Arguments:
+      chain: A Chain object
+      outfile: An open IO stream in text mode that will be written to
+    """
+    outfile.write(template.format(version=version,
+                                  order=chain.order,
+                                  analyzer=chain.analyzer_class,
+                                  data=repr(tuple([(item[0], tuple(item[1].items())) for item in chain.items()])).replace(")),", ")),\n")
+                                  ))