forked from swirlai/swirl-search
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnltk.py
37 lines (27 loc) · 992 Bytes
/
nltk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
'''
@author: Sid Probstein
@contact: sid@swirl.today
'''
import logging as logger
from django.conf import settings
from nltk.corpus import stopwords
# Label used as a prefix in this module's log messages.
module_name = 'nltk.py'
# Build the module-level stopword set at import time. The language comes from
# the Django setting SWIRL_DEFAULT_QUERY_LANGUAGE and defaults to 'english'
# when that setting is not defined.
try:
    SWIRL_DEFAULT_QUERY_LANGUAGE = getattr(settings, 'SWIRL_DEFAULT_QUERY_LANGUAGE', 'english')
    # NOTE: this rebinds the imported name `stopwords` (the NLTK corpus object)
    # to a plain set of words; after this line the corpus object is no longer
    # reachable under that name.
    stopwords = set(stopwords.words(SWIRL_DEFAULT_QUERY_LANGUAGE))
except OSError:
    # Presumably raised by NLTK when it has no stopword list for the configured
    # language -- TODO confirm. The rebinding above did not complete, so
    # `stopwords` still refers to the corpus object here and the English list
    # can be loaded as a fallback.
    logger.warning(f"{module_name}: Warning: No stopwords for language: {SWIRL_DEFAULT_QUERY_LANGUAGE}, check SWIRL_DEFAULT_QUERY_LANGUAGE in swirl_server/settings.py")
    logger.warning(f"{module_name}: Warning: Using english stopwords")
    stopwords = set(stopwords.words('english'))
from nltk.stem import PorterStemmer
# Module-level PorterStemmer instance, created once when this module is imported.
ps = PorterStemmer()
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize.punkt import PunktToken
def is_punctuation(c):
    '''
    Report whether a single character counts as punctuation according to
    NLTK's PunktToken classification.

    :param c: candidate value; only inputs of length one can be reported
        as punctuation
    :return: False for falsy input (None, empty string) and for anything
        longer than one character; otherwise True exactly when
        PunktToken(c).is_non_punct is falsy
    '''
    # Falsy input and multi-character strings are rejected before NLTK is
    # consulted; the falsy check comes first so len() is never called on None.
    if not c or len(c) > 1:
        return False
    return not PunktToken(c).is_non_punct