-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path03 Data_Transformation.py
119 lines (87 loc) · 4.56 KB
/
03 Data_Transformation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Check Data type
print("Daily Activity data type is", daily_activity["ActivityDate"].dtypes, "data type")
print("Sleepday data type is", sleepday["SleepDay"].dtypes, "data type")
print("Weight Log data type is", weight_log["Date"].dtypes, "data type")
print("Heart Rate data type is", heart_rate["Time"].dtypes, "data type")
# Transform Date to DateTime data type
daily_activity["ActivityDate"] = pd.to_datetime(daily_activity["ActivityDate"])
sleepday['SleepDay'] = pd.to_datetime(sleepday['SleepDay'])
weight_log['Date'] = pd.to_datetime(weight_log['Date'])
heart_rate['Time'] = pd.to_datetime(heart_rate['Time'])
# Check the changes
print("Daily Activity data type is", daily_activity["ActivityDate"].dtypes, "data type")
print("Sleepday data type is", sleepday["SleepDay"].dtypes, "data type")
print("Weight Log data type is", weight_log["Date"].dtypes, "data type")
print("Heart Rate data type is", heart_rate["Time"].dtypes, "data type")
# Check the dataframe shape
print("Daily Activity:",daily_activity.shape)
print("Sleepday:",sleepday.shape)
print("Weight Log:",weight_log.shape)
print("Heart Rate:",heart_rate.shape)
# Checking duplicates
print(daily_activity['ActivityDate'].duplicated().any())
print(sleepday['SleepDay'].duplicated().any())
print(weight_log['Date'].duplicated().any())
print(heart_rate['Time'].duplicated().any())
# Removing duplicates
# 1. daily_activity:
daily_activity.drop_duplicates(inplace=True)
# 2. sleep_day:
sleepday.drop_duplicates(inplace=True)
# 3. weight_log:
weight_log.drop_duplicates(inplace=True)
# 4. heart_rate:
heart_rate.drop_duplicates(inplace=True)
# Checking missing values
print(daily_activity['ActivityDate'].isnull().any())
print(sleepday['SleepDay'].isnull().any())
print(weight_log['Date'].isnull().any())
print(heart_rate['Time'].isnull().any())
# Renaming Columns for Simplified Merge
# Replace date/time column names with 'ActivityDate' in the daily_activity dataframe
daily_activity = daily_activity.rename(columns={'ActivityDate': 'ActivityDate'})
# Replace date/time column names with 'ActivityDate' in the sleepday dataframe
sleepday = sleepday.rename(columns={'SleepDay': 'ActivityDate'})
# Replace date/time column names with 'ActivityDate' in the weight_log dataframe
weight_log = weight_log.rename(columns={'Date': 'ActivityDate'})
# Replace date/time column names with 'ActivityDate' in the heart_rate dataframe
heart_rate = heart_rate.rename(columns={'Time': 'ActivityDate'})
# Check the dataframe columns
print("Daily Activity:",daily_activity.columns)
print("Sleepday:",sleepday.columns)
print("Weight Log:",weight_log.columns)
print("Heart Rate:",heart_rate.columns)
# Concatenate the DataFrames
# Step 1: Concatenate the DataFrames vertically to combine all the data
combined_df = pd.concat([daily_activity, sleepday, weight_log, heart_rate], ignore_index=True)
# Step 2: Sort the combined DataFrame by "ActivityDate"
combined_df.sort_values(by='ActivityDate', inplace=True)
# Step 3: Merge the data using forward fill (ffill) to fill NaN values in subsequent columns
combined_df.ffill(inplace=True)
print(combined_df)
# Checking duplicates
print(combined_df.duplicated().any())
# Checking missing values
print(combined_df.isnull().any())
# Handle missing values and duplicates
# Replace missing values in numeric columns with their mean
numeric_cols = ['TotalMinutesAsleep', 'TotalTimeInBedMin', 'TotalHoursAsleep', 'TotalTimeInBedHour', 'WeightKg', 'WeightPounds', 'BMI', 'MinuteAverage', 'HourlyAverage']
combined_df[numeric_cols] = combined_df[numeric_cols].fillna(combined_df[numeric_cols].mean())
# Reset the index after dropping rows
combined_df.reset_index(drop=True, inplace=True)
# Check if any missing values remain
print(combined_df.isnull().sum())
#Removing duplicates
combined_df.drop_duplicates(inplace=True)
print(combined_df.head())
# Create the "WeekDay" column based on the day of the week (Monday: 0, Sunday: 6)
combined_df.loc[:, "WeekDay"] = combined_df["ActivityDate"].dt.weekday
# Create a new list called new_cols with the desired column order for the combined_df DataFrame
new_cols = ["Id", "ActivityDate", "WeekDay",'TotalSteps', 'TotalDistance', 'TotalActiveMinutes', 'TotalMinutes',
'TotalActiveHours', 'LightlyActiveMinutes', 'FairlyActiveMinutes',
'VeryActiveMinutes', 'LightActiveDistance', 'ModeratelyActiveDistance',
'VeryActiveDistance', 'Calories', 'TotalHoursAsleep',
'WeightKg', 'Fat', 'BMI', 'MinuteAverage', 'HourlyAverage']
# Reorder the columns of the merged_data DataFrame based on the new_cols list
combined_df = combined_df[new_cols]
print(combined_df)