Files
carkov/carkov/analyze/utils.py

61 lines
1.9 KiB
Python

#
# carkov markov chain library
# © Copyright 2025 by Aldercone Studio <alderconestudio@gmail.com>
# This is free software, see the included LICENSE for terms and conditions.
#
# for now we'll use nltk but this is here to make it so we implement our own
import nltk
# Make sure the NLTK "punkt" sentence-tokenizer models are available;
# probe with a trivial call and download them on first use.
try:
    nltk.sent_tokenize("foo bar")
except LookupError:
    # Model data is missing from the NLTK data path — fetch it.
    nltk.download('punkt')
    nltk.download('punkt_tab')

# Character classes used by the tokenizer below.
# this is surely incomplete, but good enough possibly.
PUNC = "!@#^&*():/\\.,+-?|;'\"<>“”‘’‽=_$"
WS = " \t"
TERM_PUNC = "!.?‽"
def english_sentence_split(corpus: str, quotations: str = '"\'', sent_endings: str = TERM_PUNC) -> list[str]:
    """
    Segment a corpus into a list of separate sentences.

    Sentence-terminating punctuation and quotations separate the
    sentences but are included in the output. Punctuation only applies
    when there is a space before or after it.

    NOTE: ``quotations`` and ``sent_endings`` are currently unused —
    segmentation is delegated entirely to ``nltk.sent_tokenize`` until a
    native implementation exists (see FIXME below).

    :param corpus: the text to split into sentences
    :param quotations: characters treated as quotation delimiters (unused)
    :param sent_endings: sentence-terminating punctuation (unused)
    :return: list of sentences, punctuation included
    """
    # This is harder than just splitting on sent_endings — quotations make
    # things really complicated, especially since ' is both a quotation
    # delimiter and an apostrophe; we could count it as an apostrophe only
    # when there are no spaces around it and treat it as a quotation
    # otherwise. We'll need to write a proper parser for it.
    # FIXME: implement natively and honor quotations/sent_endings.
    return nltk.sent_tokenize(corpus)
def english_sentence_tokenize(sentence: str) -> list[str]:
    """
    Tokenize a sentence into word and punctuation tokens.

    Rules:
      * words are separated by whitespace (``WS``) or punctuation (``PUNC``)
      * each punctuation character is emitted as its own token
      * whitespace separates tokens but is never emitted itself

    :param sentence: the sentence to tokenize
    :return: tokens in order of appearance
    """
    start = 0
    out: list[str] = []
    # `ch` rather than `chr`, which shadows the builtin.
    for idx, ch in enumerate(sentence):
        if ch in PUNC or ch in WS:
            # Flush any word accumulated before this separator.
            if start != idx:
                out.append(sentence[start:idx])
            if ch in PUNC:
                # Punctuation is kept as a token of its own.
                out.append(ch)
            start = idx + 1
    # BUG FIX: the original dropped a trailing word that was not followed
    # by punctuation or whitespace (e.g. "hi there" lost "there"); flush
    # the final pending token here.
    if start != len(sentence):
        out.append(sentence[start:])
    return out