Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4c7c3d0d9b |
@ -4,4 +4,4 @@
|
|||||||
# This is free software, see the included LICENSE for terms and conditions.
|
# This is free software, see the included LICENSE for terms and conditions.
|
||||||
#
|
#
|
||||||
|
|
||||||
version = '0.4.0'
|
version = '0.4.1'
|
||||||
|
|||||||
@ -4,16 +4,6 @@
|
|||||||
# This is free software, see the included LICENSE for terms and conditions.
|
# This is free software, see the included LICENSE for terms and conditions.
|
||||||
#
|
#
|
||||||
|
|
||||||
# for now we'll use nltk but this is here to make it so we implement our own
|
|
||||||
import nltk
|
|
||||||
|
|
||||||
try:
|
|
||||||
nltk.sent_tokenize("foo bar")
|
|
||||||
except LookupError:
|
|
||||||
nltk.download('punkt')
|
|
||||||
nltk.download('punkt_tab')
|
|
||||||
|
|
||||||
|
|
||||||
# this is surely incomplete, but good enough possibly.
|
# this is surely incomplete, but good enough possibly.
|
||||||
PUNC="!@#^&*():/\\.,+-?|;'\"<>“”‘’‽=_$"
|
PUNC="!@#^&*():/\\.,+-?|;'\"<>“”‘’‽=_$"
|
||||||
WS=" \t"
|
WS=" \t"
|
||||||
@ -33,8 +23,21 @@ def english_sentence_split(corpus: str, quotations: str = '"\'', sent_endings: s
|
|||||||
# if there's no spaces around it, otherwise treat it as a quotation. We'll need to write a proper parser for it.
|
# if there's no spaces around it, otherwise treat it as a quotation. We'll need to write a proper parser for it.
|
||||||
|
|
||||||
# FIXME
|
# FIXME
|
||||||
return nltk.sent_tokenize(corpus)
|
# for now we'll use nltk but this is here to make it so we implement our own
|
||||||
|
try:
|
||||||
|
import nltk
|
||||||
|
|
||||||
|
try:
|
||||||
|
nltk.sent_tokenize("foo bar")
|
||||||
|
except LookupError:
|
||||||
|
nltk.download('punkt')
|
||||||
|
nltk.download('punkt_tab')
|
||||||
|
|
||||||
|
return nltk.sent_tokenize(corpus)
|
||||||
|
except ImportError:
|
||||||
|
import sys
|
||||||
|
print("Cannot import nltk, we require for the time being nltk to be installed to do english sentence splitting.")
|
||||||
|
sys.exit(-1)
|
||||||
|
|
||||||
def english_sentence_tokenize(sentence: str) -> list[str]:
|
def english_sentence_tokenize(sentence: str) -> list[str]:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
[metadata]
|
[metadata]
|
||||||
name = carkov
|
name = carkov
|
||||||
version = 0.4.0
|
version = 0.4.1
|
||||||
description = A markov chainer library
|
description = A markov chainer library
|
||||||
author = Aldercone Studio
|
author = Aldercone Studio
|
||||||
author_email = alderconestudio@gmail.com
|
author_email = alderconestudio@gmail.com
|
||||||
@ -33,7 +33,6 @@ packages =
|
|||||||
zip_safe = true
|
zip_safe = true
|
||||||
install_requires =
|
install_requires =
|
||||||
unidecode
|
unidecode
|
||||||
nltk
|
|
||||||
msgpack
|
msgpack
|
||||||
|
|
||||||
[options.entry_points]
|
[options.entry_points]
|
||||||
|
|||||||
Reference in New Issue
Block a user