-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepare.py
35 lines (28 loc) · 1.26 KB
/
prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#Importing required packages and files
import pandas as pd
import numpy as np
import os
#Tools to build machine learning models and reports
from sklearn.model_selection import train_test_split
#Suppresses all warnings so output stays clean (improves aesthetics)
import warnings
warnings.filterwarnings("ignore")
def train_validate(df, stratify_col=None, random_seed=1969):
    """
    Split a DataFrame into three parts for training, validating and testing.

    The split happens in two stages:
      1. 'df' is split 80/20 into a working 'train' set and the 'test' set.
      2. The working 'train' set is split 60/40 into the final 'train' set
         and the 'validate' set.
    Overall this yields roughly 48% train, 32% validate and 20% test.

    Parameters:
        df: the DataFrame to split.
        stratify_col: label of the column in 'df' to stratify on, so each
            part keeps that column's class proportions. The default is None,
            which disables stratification.
        random_seed: passed as 'random_state' to both splits so the result
            is reproducible.

    Returns:
        A tuple (train, validate, test).
    """
    #Identity check against None: '!= None' would invoke the operand's own
    #comparison logic instead of simply testing whether a column was given.
    use_stratify = stratify_col is not None

    #This splits the DataFrame into 'train' and 'test':
    train, test = train_test_split(
        df,
        train_size=.8,
        stratify=df[stratify_col] if use_stratify else None,
        random_state=random_seed,
    )

    #The stratify values must be re-read from the smaller 'train' DataFrame,
    #because their length has to match the data being split:
    #This splits the larger 'train' DataFrame into a smaller 'train' and 'validate' DataFrames:
    train, validate = train_test_split(
        train,
        train_size=.6,
        stratify=train[stratify_col] if use_stratify else None,
        random_state=random_seed,
    )

    return train, validate, test