-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlanguage.py
76 lines (55 loc) · 2.33 KB
/
language.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import sys

from spacy.en import English

try:
    from nltk import wordpunct_tokenize
    from nltk.corpus import stopwords
except ImportError:
    # Both language-detection helpers below are unusable without nltk.
    # Point the user at the install instructions, then re-raise so the
    # failure happens here instead of as a NameError later on.
    # (Parenthesized print works on both Python 2 and 3; the original
    # `print '...'` statement form is a SyntaxError on Python 3.)
    print('[!] You need to install nltk (http://nltk.org/index.html)')
    raise

# Shared spaCy English pipeline, built once at import time (construction
# is expensive, so it must not happen per call).
nlp = English()
#----------------------------------------------------------------------
def _calculate_languages_ratios(text):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}

    @param text: Text whose language want to be detected
    @type text: str

    @return: Dictionary with languages and unique stopwords seen in analyzed text
    @rtype: dict
    """
    languages_ratios = {}

    # nltk.wordpunct_tokenize() splits all punctuations into separate tokens:
    # >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
    # ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll', ...]
    tokens = wordpunct_tokenize(text)

    # Build the lowercased word set ONCE — it does not depend on the
    # language, so the original per-iteration set(words) was wasted work.
    words_set = set(word.lower() for word in tokens)

    # Score each language shipped with nltk by how many of its unique
    # stopwords appear in the analyzed text.
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios
#----------------------------------------------------------------------
def detect_language(text):
    """
    Guess the language of *text* and return the single best-scoring one.

    Scoring is stopword-based: each candidate language gets one point per
    unique stopword of that language seen in the text (see
    _calculate_languages_ratios), and the language with the highest score
    wins.

    @param text: Text whose language want to be detected
    @type text: str

    @return: Most scored language guessed
    @rtype: str
    """
    scores = _calculate_languages_ratios(text)
    return max(scores, key=scores.get)
def tokenize_document(docpair, verbose=False):
    """
    Gets tokens from a text in English.

    @param docpair: (doc_id, text) pair; the text is run through the shared
        module-level spaCy pipeline ``nlp``
    @param verbose: when True, write a carriage-return progress line to stdout
    @return: list of lowercased, ascii-encoded token strings
    """
    doc_id, text = docpair
    if verbose:
        sys.stdout.write('working on doc {}'.format(doc_id) + '\r')
    tokens = []
    for token in nlp(text):
        tokens.append(token.lower_.encode('ascii', errors='ignore'))
    return tokens