61 lines
1.9 KiB
Python
61 lines
1.9 KiB
Python
#
|
|
# carkov markov chain library
|
|
# © Copyright 2025 by Aldercone Studio <alderconestudio@gmail.com>
|
|
# This is free software, see the included LICENSE for terms and conditions.
|
|
#
|
|
|
|
# for now we'll use nltk but this is here to make it so we implement our own
|
|
import nltk
|
|
|
|
try:
    # Probe whether the NLTK "punkt" sentence-tokenizer models are installed;
    # sent_tokenize raises LookupError when the data is missing.
    nltk.sent_tokenize("foo bar")
except LookupError:
    # First run on this machine: fetch the tokenizer models.
    # NOTE: this performs network I/O at import time.
    nltk.download('punkt')
    nltk.download('punkt_tab')
|
|
|
|
|
|
# this is surely incomplete, but good enough possibly.
# Punctuation characters treated as standalone tokens by the tokenizer below.
PUNC="!@#^&*():/\\.,+-?|;'\"<>“”‘’‽=_$"
# Whitespace characters that separate tokens (space and tab only; newlines
# are assumed to have been handled by sentence splitting — TODO confirm).
WS=" \t"
# Punctuation that may terminate an English sentence.
TERM_PUNC="!.?‽"
|
|
|
|
def english_sentence_split(corpus: str, quotations: str = '"\'', sent_endings: str = TERM_PUNC) -> list[str]:
    """
    Segment a corpus into a list of separate sentences.

    Sentence-terminating punctuation and quotations are used to separate each sentence, but included in the output.

    Punctuation only applies if there is a space before or after it.

    NOTE: this currently delegates entirely to ``nltk.sent_tokenize``; the
    ``quotations`` and ``sent_endings`` parameters are accepted for the
    planned hand-rolled implementation but are ignored for now.
    """
    # this is harder to do than you'd think just splitting on the sent_endings - quotations make things really complicated
    # especially since ' is both a quotation container and an apostrophe; i guess we could only count it as an apostrophe
    # if there's no spaces around it, otherwise treat it as a quotation. We'll need to write a proper parser for it.

    # FIXME: replace the nltk delegation with the parser described above,
    # honoring the quotations and sent_endings parameters.
    return nltk.sent_tokenize(corpus)
|
|
|
|
|
|
def english_sentence_tokenize(sentence: str) -> list[str]:
    """
    Tokenize a sentence with the following rules:

    * each word may be separated by a space or a punctuation
    * punctuation is included as separate tokens
    * whitespace is never included in the output

    :param sentence: a single sentence (typically one element of
        ``english_sentence_split``'s output).
    :return: a list of word and punctuation tokens in original order;
        empty input yields an empty list.
    """
    start = 0
    out: list[str] = []
    # 'ch' instead of the original 'chr' to avoid shadowing the builtin.
    for idx, ch in enumerate(sentence):
        if ch in PUNC:
            # Flush the word accumulated since 'start' (if any), then emit
            # the punctuation character as its own token.
            if start != idx:
                out.append(sentence[start:idx])
            out.append(ch)
            start = idx + 1
        elif ch in WS:
            # Whitespace terminates the current word but is not a token.
            if start != idx:
                out.append(sentence[start:idx])
            start = idx + 1
    # BUGFIX: flush the trailing word. The original implementation dropped
    # the final token when the sentence did not end in punctuation or
    # whitespace (e.g. "hello world" tokenized to ["hello"]).
    if start < len(sentence):
        out.append(sentence[start:])
    return out
|