From 3fae360c2ec0f254a1a7d3ed1ae3a5210726732d Mon Sep 17 00:00:00 2001
From: Cassowary
Date: Tue, 24 Mar 2026 19:33:30 -0700
Subject: [PATCH] Bump version to 0.4; fix some changes to analyze/english

---
 carkov/__init__.py        |  2 +-
 carkov/analyze/english.py | 14 +++-----------
 setup.cfg                 |  2 +-
 3 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/carkov/__init__.py b/carkov/__init__.py
index 346eb08..7ad4525 100644
--- a/carkov/__init__.py
+++ b/carkov/__init__.py
@@ -4,4 +4,4 @@
 # This is free software, see the included LICENSE for terms and conditions.
 #
 
-version = '0.2.0'
+version = '0.4.0'
diff --git a/carkov/analyze/english.py b/carkov/analyze/english.py
index 9f3348b..ffd3737 100644
--- a/carkov/analyze/english.py
+++ b/carkov/analyze/english.py
@@ -4,16 +4,8 @@
 # This is free software, see the included LICENSE for terms and conditions.
 #
 
-import nltk
-
 from .abstract import AbstractAnalyzer
-
-
-try:
-    nltk.sent_tokenize("foo bar")
-except LookupError:
-    nltk.download('punkt')
-
+from .utils import english_sentence_split, english_sentence_tokenize
 
 class English(AbstractAnalyzer):
     def __init__(self, order, filters=None):
@@ -25,8 +17,8 @@ class English(AbstractAnalyzer):
         chunks = corpus.split('\n\n')
         ret = []
         for chunk in chunks:
-            ret = ret + nltk.sent_tokenize(chunk)
+            ret = ret + english_sentence_split(chunk)
         return ret
 
     def tokenize_segment(self, segment):
-        return list(nltk.word_tokenize(segment))
+        return list(english_sentence_tokenize(segment))
diff --git a/setup.cfg b/setup.cfg
index 3b7b3f0..a0914b3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = carkov
-version = 0.2.0
+version = 0.4.0
 description = A markov chainer library
 author = Aldercone Studio
 author_email = alderconestudio@gmail.com