diff --git a/carkov/__init__.py b/carkov/__init__.py index 7ad4525..5983ee3 100644 --- a/carkov/__init__.py +++ b/carkov/__init__.py @@ -4,4 +4,4 @@ # This is free software, see the included LICENSE for terms and conditions. # -version = '0.4.0' +version = '0.4.1' diff --git a/carkov/analyze/utils.py b/carkov/analyze/utils.py index 9298f96..0e72906 100644 --- a/carkov/analyze/utils.py +++ b/carkov/analyze/utils.py @@ -4,16 +4,6 @@ # This is free software, see the included LICENSE for terms and conditions. # -# for now we'll use nltk but this is here to make it so we implement our own -import nltk - -try: - nltk.sent_tokenize("foo bar") -except LookupError: - nltk.download('punkt') - nltk.download('punkt_tab') - - # this is surely incomplete, but good enough possibly. PUNC="!@#^&*():/\\.,+-?|;'\"<>“”‘’‽=_$" WS=" \t" @@ -33,8 +23,21 @@ def english_sentence_split(corpus: str, quotations: str = '"\'', sent_endings: s # if there's no spaces around it, otherwise treat it as a quotation. We'll need to write a proper parser for it. # FIXME - return nltk.sent_tokenize(corpus) + # for now we'll use nltk but this is here to make it so we implement our own + try: + import nltk + try: + nltk.sent_tokenize("foo bar") + except LookupError: + nltk.download('punkt') + nltk.download('punkt_tab') + + return nltk.sent_tokenize(corpus) + except ImportError: + import sys + print("Cannot import nltk, we require for the time being nltk to be installed to do english sentence splitting.") + sys.exit(-1) def english_sentence_tokenize(sentence: str) -> list[str]: """ diff --git a/setup.cfg b/setup.cfg index a0914b3..4655570 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = carkov -version = 0.4.0 +version = 0.4.1 description = A markov chainer library author = Aldercone Studio author_email = alderconestudio@gmail.com @@ -33,7 +33,6 @@ packages = zip_safe = true install_requires = unidecode - nltk msgpack [options.entry_points]