Commit utils move and the slow py dumper

This commit is contained in:
2026-03-24 19:33:51 -07:00
parent 3fae360c2e
commit 10195658e6
2 changed files with 103 additions and 0 deletions

60
carkov/analyze/utils.py Normal file
View File

@ -0,0 +1,60 @@
#
# carkov markov chain library
# © Copyright 2025 by Aldercone Studio <alderconestudio@gmail.com>
# This is free software, see the included LICENSE for terms and conditions.
#
# for now we'll use nltk but this is here to make it so we implement our own
import nltk
# Probe for the NLTK "punkt" sentence-tokenizer models; sent_tokenize raises
# LookupError when the model data is not installed locally, in which case we
# fetch it.  NOTE(review): this runs at import time and may hit the network.
try:
    nltk.sent_tokenize("foo bar")
except LookupError:
    nltk.download('punkt')
    nltk.download('punkt_tab')
# this is surely incomplete, but good enough possibly.
# Characters treated as stand-alone punctuation tokens (includes curly quotes).
PUNC="!@#^&*():/\\.,+-?|;'\"<>“”‘’‽=_$"
# Whitespace characters that separate words (space and tab only).
WS=" \t"
# Punctuation that terminates a sentence.
TERM_PUNC="!.?‽"
def english_sentence_split(corpus: str, quotations: str = '"\'', sent_endings: str = TERM_PUNC) -> list[str]:
    """
    Split a corpus into a list of individual sentences.

    Sentence-terminating punctuation and quotation marks delimit sentences and
    are kept in the output; punctuation only acts as a delimiter when there is
    a space before or after it.

    NOTE: ``quotations`` and ``sent_endings`` are currently unused — splitting
    is delegated to nltk.sent_tokenize until a proper parser is written.
    Quotations make a hand-rolled splitter genuinely hard (``'`` is both a
    quote delimiter and an apostrophe; one heuristic would be to treat it as
    an apostrophe only when it has no adjacent spaces).  FIXME
    """
    sentences = nltk.sent_tokenize(corpus)
    return sentences
def english_sentence_tokenize(sentence: str,
                              punctuation: str = "!@#^&*():/\\.,+-?|;'\"<>“”‘’‽=_$",
                              whitespace: str = " \t") -> list[str]:
    """
    Tokenize a sentence with the following rules:

    * words are separated by whitespace or punctuation
    * punctuation characters are emitted as separate single-character tokens
    * whitespace is dropped

    Arguments:
        sentence: the sentence to tokenize
        punctuation: characters treated as stand-alone tokens
                     (defaults to the module's PUNC set)
        whitespace: characters that separate words without producing a token
                    (defaults to the module's WS set)

    Returns:
        list of tokens in original order
    """
    start = 0
    out: list[str] = []
    for idx, ch in enumerate(sentence):  # `ch` rather than shadowing builtin `chr`
        if ch in punctuation:
            if start != idx:
                out.append(sentence[start:idx])
            out.append(ch)
            start = idx + 1
        elif ch in whitespace:  # exclusive with the branch above: emit no token
            if start != idx:
                out.append(sentence[start:idx])
            start = idx + 1
    # Bug fix: the original dropped any trailing word that was not followed by
    # punctuation or whitespace (e.g. "hello world" lost "world").
    if start < len(sentence):
        out.append(sentence[start:])
    return out

43
carkov/pydumperslow.py Normal file
View File

@ -0,0 +1,43 @@
#
# carkov markov chain library
# © Copyright 2026 by Aldercone Studio <alderconestudio@gmail.com>
# This is free software, see the included LICENSE for terms and conditions.
#
"""
Serialize chain as a python structure (slower load time but more efficient compilation).
"""
from io import TextIOBase
from . import version
from .chain import Chain
template = """
# serialized from version {version}
from carkov.chain import Chain
from carkov.abstracts import NUMBER, TERMINAL, Abstract
DATA={data}
def get_chainer():
chain = Chain({order}, "{analyzer}")
chain.data = {{}}
for chain_data in DATA:
chain.data[chain_data[0]] = {{x[0]: x[1] for x in chain_data[1]}}
return chain
"""
def dump_chainer(chain: Chain, outfile: TextIOBase):
    """
    Serialize a chainer to an open IO stream as importable Python source.

    Arguments:
        chain: A Chain object
        outfile: An open IO stream in text mode that will be written to
    """
    # Snapshot the chain as (key, ((token, weight), ...)) entry tuples.
    entries = [(key, tuple(counts.items())) for key, counts in chain.items()]
    # Emit one entry per line by joining explicit per-entry reprs.  The
    # previous approach — repr(...).replace(")),", ")),\n") — could insert a
    # newline inside a string literal whenever a stored token happened to
    # contain ")),", producing a syntactically broken generated file.
    if entries:
        data_src = "(" + ",\n".join(repr(entry) for entry in entries) + ",)"
    else:
        # Empty chain: emit a valid empty tuple literal.
        data_src = "()"
    outfile.write(template.format(version=version,
                                  order=chain.order,
                                  analyzer=chain.analyzer_class,
                                  data=data_src))