wrangle.py
# imports
import pandas as pd
import unicodedata
import re
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split


def bin_language(df):
    '''Takes in a dataframe, bins the language column into Python, JavaScript,
    C++, Java, or Other, and returns the dataframe.'''
    keep = ['Python', 'JavaScript', 'C++', 'Java']
    # use .loc so the assignment actually modifies the dataframe
    # (chained indexing like df.iloc[i]['language'] = ... is silently ignored)
    df.loc[~df['language'].isin(keep), 'language'] = 'Other'
    return df


def clean_text(text, extra_stop_words=None, exclude_stop_words=None):
    '''Takes in a string, a list of extra stop words to add, and a list of
    stop words to exclude.
    Normalizes to ASCII, lowercases, replaces anything that is not a word
    character with a space, removes stop words, lemmatizes, and returns the
    cleaned string.'''
    wnl = nltk.stem.WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    stop_words.update(extra_stop_words or [])
    stop_words.difference_update(exclude_stop_words or [])
    normalized = (unicodedata.normalize('NFKD', text)
                  .encode('ascii', 'ignore')
                  .decode('utf-8', 'ignore')
                  .lower())
    words = re.sub(r'[^\w\s]', ' ', normalized).split()
    clean_words = [wnl.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(clean_words)
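
# Quick illustration of clean_text (a sketch, assuming the nltk 'wordnet' and
# 'stopwords' corpora have already been downloaded via nltk.download):
#   clean_text('The Dogs were running!')  ->  'dog running'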


def prepare_df(df):
    '''Takes in a dataframe.
    Bins languages, drops the Other rows, then adds the cleaned readme text,
    the length of the raw readme, and the number of unique words in the
    cleaned text as new columns.
    Returns the dataframe.'''
    df = bin_language(df)
    df = df[df.language != 'Other'].reset_index(drop=True)
    df['readme_contents_clean'] = df['readme_contents'].apply(clean_text)
    df['length'] = df['readme_contents'].str.len()
    # split before set() so we count unique words, not unique characters
    df['unique'] = df['readme_contents_clean'].apply(lambda s: len(set(s.split())))
    return df


def train_val_test(df, target=None, stratify=None, seed=42):
    '''Split data into train, validate, and test subsets with a 60/20/20 ratio.
    If stratify is a column name, both splits preserve its class proportions.'''
    strat = df[stratify] if stratify else None
    train, val_test = train_test_split(df, train_size=0.6, random_state=seed, stratify=strat)
    strat = val_test[stratify] if stratify else None
    val, test = train_test_split(val_test, train_size=0.5, random_state=seed, stratify=strat)
    return train, val, test
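
# Example (a sketch): train, val, test = train_val_test(df, stratify='language')
# splits a dataframe roughly 60/20/20 while preserving language proportions.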


def x_y_split(df, target, seed=42):
    '''Split df into X_train, y_train, X_val, y_val, X_test, y_test,
    where X is the cleaned readme text and y is the target column.'''
    # pass arguments by keyword so seed is not misread as the stratify argument
    train, val, test = train_val_test(df, target=target, seed=seed)
    X_train, y_train = train.readme_contents_clean, train[target]
    X_val, y_val = val.readme_contents_clean, val[target]
    X_test, y_test = test.readme_contents_clean, test[target]
    return X_train, y_train, X_val, y_val, X_test, y_test
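

# End-to-end sketch of how these functions fit together. The file name
# 'repos.csv' is hypothetical; it stands in for the project's acquisition step
# and is assumed to contain 'language' and 'readme_contents' columns:
#
#   df = pd.read_csv('repos.csv')
#   df = prepare_df(df)
#   X_train, y_train, X_val, y_val, X_test, y_test = x_y_split(df, target='language')
#   tfidf = TfidfVectorizer()
#   X_train_tfidf = tfidf.fit_transform(X_train)
#   X_val_tfidf = tfidf.transform(X_val)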