Files
carkov/carkov/analyze/utils.py

61 lines
1.9 KiB
Python

#
# carkov markov chain library
# © Copyright 2025 by Aldercone Studio <alderconestudio@gmail.com>
# This is free software, see the included LICENSE for terms and conditions.
#
# for now we'll use nltk but this is here to make it so we implement our own
import nltk
# Make sure the NLTK "punkt" sentence-tokenizer models are available;
# probe with a trivial call and download them on first use.
try:
    nltk.sent_tokenize("foo bar")
except LookupError:
    # Model data is missing from the NLTK data path — fetch it.
    nltk.download('punkt')
    nltk.download('punkt_tab')

# Character classes used by the tokenizer below.
# this is surely incomplete, but good enough possibly.
PUNC = "!@#^&*():/\\.,+-?|;'\"<>“”‘’‽=_$"
WS = " \t"
TERM_PUNC = "!.?‽"
def english_sentence_split(corpus: str, quotations: str = '"\'', sent_endings: str = TERM_PUNC) -> list[str]:
    """
    Segment a corpus into a list of separate sentences.

    Sentence-terminating punctuation and quotations separate the
    sentences but are included in the output. Punctuation only applies
    when there is a space before or after it.

    NOTE: ``quotations`` and ``sent_endings`` are currently unused —
    segmentation is delegated entirely to ``nltk.sent_tokenize`` until a
    native implementation exists (see FIXME below).

    :param corpus: the text to split into sentences
    :param quotations: characters treated as quotation delimiters (unused)
    :param sent_endings: sentence-terminating punctuation (unused)
    :return: list of sentences, punctuation included
    """
    # This is harder than just splitting on sent_endings — quotations make
    # things really complicated, especially since ' is both a quotation
    # delimiter and an apostrophe; we could count it as an apostrophe only
    # when there are no spaces around it and treat it as a quotation
    # otherwise. We'll need to write a proper parser for it.
    # FIXME: implement natively and honor quotations/sent_endings.
    return nltk.sent_tokenize(corpus)
def english_sentence_tokenize(sentence: str) -> list[str]:
    """
    Tokenize a sentence into word and punctuation tokens.

    Rules:
      * words are separated by whitespace (``WS``) or punctuation (``PUNC``)
      * each punctuation character is emitted as its own token
      * whitespace separates tokens but is never emitted itself

    :param sentence: the sentence to tokenize
    :return: tokens in order of appearance
    """
    start = 0
    out: list[str] = []
    # `ch` rather than `chr`, which shadows the builtin.
    for idx, ch in enumerate(sentence):
        if ch in PUNC or ch in WS:
            # Flush any word accumulated before this separator.
            if start != idx:
                out.append(sentence[start:idx])
            if ch in PUNC:
                # Punctuation is kept as a token of its own.
                out.append(ch)
            start = idx + 1
    # BUG FIX: the original dropped a trailing word that was not followed
    # by punctuation or whitespace (e.g. "hi there" lost "there"); flush
    # the final pending token here.
    if start != len(sentence):
        out.append(sentence[start:])
    return out