Commit utils move and the slow py dumper
This commit is contained in:
60
carkov/analyze/utils.py
Normal file
60
carkov/analyze/utils.py
Normal file
@ -0,0 +1,60 @@
|
||||
#
|
||||
# carkov markov chain library
|
||||
# © Copyright 2025 by Aldercone Studio <alderconestudio@gmail.com>
|
||||
# This is free software, see the included LICENSE for terms and conditions.
|
||||
#
|
||||
|
||||
# for now we'll use nltk but this is here to make it so we implement our own
|
||||
import nltk
|
||||
|
||||
# Ensure the NLTK "punkt" sentence-tokenizer data is available at import time:
# probe with a trivial tokenize call and, on LookupError (data not installed),
# download it.  NOTE(review): this downloads on first import — presumably
# acceptable for this library's use case, but it is a network side effect.
try:
    nltk.sent_tokenize("foo bar")
except LookupError:
    nltk.download('punkt')
    nltk.download('punkt_tab')
|
||||
|
||||
|
||||
# Punctuation sets used by the tokenizer below.  Surely an incomplete
# selection, but likely good enough.
PUNC = "!@#^&*():/\\.,+-?|;'\"<>“”‘’‽=_$"

# Characters treated as token-separating whitespace.
WS = " \t"

# Punctuation that terminates a sentence.
TERM_PUNC = "!.?‽"
|
||||
|
||||
def english_sentence_split(corpus: str, quotations: str = '"\'', sent_endings: str = TERM_PUNC) -> list[str]:
    """
    Segment a corpus into a list of separate sentences.

    Sentence-terminating punctuation and quotations are used to separate each sentence, but included in the output.

    Punctuation only applies if there is a space before or after it.

    NOTE(review): the quotations and sent_endings parameters are currently
    unused — the body delegates entirely to nltk.sent_tokenize (see the
    FIXME below).
    """
    # this is harder to do than you'd think just splitting on the sent_endings - quotations make things really complicated
    # especially since ' is both a quotation container and an apostrophe; i guess we could only count it as an apostrophe
    # if there's no spaces around it, otherwise treat it as a quotation. We'll need to write a proper parser for it.

    # FIXME: implement the rules above (honoring quotations / sent_endings)
    # instead of delegating to nltk.
    return nltk.sent_tokenize(corpus)
|
||||
|
||||
|
||||
def english_sentence_tokenize(sentence: str) -> list[str]:
    """
    Tokenize a sentence with the following rules:

    * each word may be separated by a space or a punctuation
    * punctuation is included as separate tokens

    Arguments:
        sentence: the sentence to break into tokens

    Returns:
        A list of word and punctuation tokens in order of appearance.
        Whitespace is a separator only and never appears in the output.
    """
    start = 0
    out: list[str] = []
    for idx, ch in enumerate(sentence):  # renamed from `chr` — don't shadow the builtin
        if ch in PUNC:
            # flush any word accumulated before the punctuation mark
            if start != idx:
                out.append(sentence[start:idx])
            # punctuation is emitted as its own token
            out.append(ch)
            start = idx + 1
        elif ch in WS:
            # whitespace ends the current word but is not itself a token
            if start != idx:
                out.append(sentence[start:idx])
            start = idx + 1
    # Bug fix: the original dropped a trailing word when the sentence did not
    # end in punctuation or whitespace (e.g. "foo bar" tokenized to ["foo"]).
    if start != len(sentence):
        out.append(sentence[start:])
    return out
|
||||
43
carkov/pydumperslow.py
Normal file
43
carkov/pydumperslow.py
Normal file
@ -0,0 +1,43 @@
|
||||
#
|
||||
# carkov markov chain library
|
||||
# © Copyright 2026 by Aldercone Studio <alderconestudio@gmail.com>
|
||||
# This is free software, see the included LICENSE for terms and conditions.
|
||||
#
|
||||
|
||||
"""
|
||||
Serialize chain as a python structure (slower load time but more efficient compilation).
|
||||
"""
|
||||
|
||||
from io import TextIOBase
|
||||
from . import version
|
||||
from .chain import Chain
|
||||
|
||||
template = """
|
||||
# serialized from version {version}
|
||||
from carkov.chain import Chain
|
||||
from carkov.abstracts import NUMBER, TERMINAL, Abstract
|
||||
|
||||
DATA={data}
|
||||
|
||||
def get_chainer():
|
||||
chain = Chain({order}, "{analyzer}")
|
||||
chain.data = {{}}
|
||||
for chain_data in DATA:
|
||||
chain.data[chain_data[0]] = {{x[0]: x[1] for x in chain_data[1]}}
|
||||
return chain
|
||||
"""
|
||||
|
||||
|
||||
def dump_chainer(chain: Chain, outfile: TextIOBase):
    """
    Serialize a chainer to an open IO stream.

    Arguments:
        chain: A Chain object
        outfile: An open IO stream in text mode that will be written to
    """
    # Emit one repr'd (key, ((token, count), ...)) row per chain entry and join
    # the rows with newlines ourselves.  The previous implementation ran a
    # blanket str.replace(")),", ")),\n") over the repr of the whole structure,
    # which could insert a newline *inside* a repr'd string literal whenever a
    # token itself contained ")),", producing a syntactically invalid module.
    rows = [repr((key, tuple(counts.items()))) for key, counts in chain.items()]
    if len(rows) == 1:
        # a one-element tuple literal needs the trailing comma
        data = "({},)".format(rows[0])
    else:
        data = "({})".format(",\n".join(rows))
    outfile.write(template.format(version=version,
                                  order=chain.order,
                                  analyzer=chain.analyzer_class,
                                  data=data))
|
||||
Reference in New Issue
Block a user