-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path04 Data_Analysis.py
233 lines (189 loc) · 8.84 KB
/
04 Data_Analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# How active are the users along the week?
# Create the "WeekDay" column based on the day of the week names
daily_activity['WeekDay'] = daily_activity['ActivityDate'].dt.day_name()
# Check which day people do more exercise
weekday_counts = daily_activity['WeekDay'].value_counts()
# Print the count of exercises for each day
print("Exercise counts for each day:")
print(weekday_counts)
# Exercise counts for each day
weekday_counts = {
'Sunday': 578,
'Monday': 576,
'Tuesday': 790,
'Wednesday': 770,
'Thursday': 742,
'Friday': 654,
'Saturday': 639,
}
# Define the order of weekdays based on the provided counts
weekday_order = list(weekday_counts.keys())
# Use a color-blind-friendly palette
colors = sns.color_palette("Set3")
# Set the figure size
plt.figure(figsize=(10, 6))
# Create a bar plot with different colors
bars = plt.bar(weekday_counts.keys(), weekday_counts.values(), color=colors)
# Add labels and title
plt.xlabel('Day of the Week')
plt.ylabel('Exercise Counts')
plt.title('Exercise Counts for Each Day')
# Add numbers on each bar
for bar, count in zip(bars, weekday_counts.values()):
plt.text(bar.get_x() + bar.get_width() / 2 - 0.1, bar.get_height() + 10, str(count), ha='center', color='black')
# Show the plot
plt.show()
# Average Activity Level for Each Day of the Week
active_minutes_columns = ['LightlyActiveMinutes', 'FairlyActiveMinutes', 'VeryActiveMinutes']
active_minutes_avg = daily_activity.groupby('WeekDay')[active_minutes_columns].mean().reindex(weekday_order)
# Set up the figure with larger size
plt.figure(figsize=(18, 12))
# Grouped bar plot with stacked bars for average active minutes
ax1 = plt.subplot(2, 1, 1)
active_minutes_avg.plot(kind='bar', stacked=True, color=['skyblue', 'lightgreen', 'salmon'], ax=ax1)
# Adding numbers to each bar for average active minutes
for i, day in enumerate(weekday_order):
total_height = 0
for col in active_minutes_columns:
value = active_minutes_avg.loc[day, col]
plt.text(i, total_height + value / 2, f'{value:.2f}', ha='center', va='center', color='black')
total_height += value
# Adding legend for average active minutes with more space
ax1.legend(["Light Active Minutes", "Fairly Active Minutes", "Very Active Minutes"], bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0., ncol=1)
# Adding labels and title for average active minutes
ax1.set_xlabel('Day of the Week')
ax1.set_ylabel('Average Activity Intensity (min)')
ax1.set_title('Average Activity Level for Each Day of the Week')
# Average Activity Distance for Each Day of the Week
active_distance_columns = ['LightActiveDistance', 'ModeratelyActiveDistance', 'VeryActiveDistance']
active_distance_avg = daily_activity.groupby('WeekDay')[active_distance_columns].mean().reindex(weekday_order)
# Set up the figure with larger size
plt.figure(figsize=(18, 12))
# Grouped bar plot with stacked bars for average activity distance
ax2 = plt.subplot(2, 1, 2)
active_distance_avg.plot(kind='bar', stacked=True, color=['skyblue', 'lightgreen', 'salmon'], ax=ax2)
# Adding numbers to each bar for average activity distance
for i, day in enumerate(weekday_order):
total_height = 0
for col in active_distance_columns:
value = active_distance_avg.loc[day, col]
plt.text(i, total_height + value / 2, f'{value:.2f}', ha='center', va='center', color='black')
total_height += value
# Adding legend for average activity distance with more space
ax2.legend(["Light Active Distance", "Moderately Active Distance", "Very Active Distance"], bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0., ncol=1)
# Adding labels and title for average activity distance
ax2.set_xlabel('Day of the Week')
ax2.set_ylabel('Average Active Distance (km)')
ax2.set_title('Average Active Distance for Each Day of the Week')
# Checking the columns of combined_df
combined_df.columns
# Calculate summary statistics for numeric columns
summary_stats = combined_df.describe()
# Display the summary statistics
(summary_stats)
# Checking if users meet WHO guidelines
# Aggregate data at the user level
user_aggregated_df = daily_activity.groupby('Id').agg({
'LightlyActiveMinutes': 'sum',
'FairlyActiveMinutes': 'sum',
'VeryActiveMinutes': 'sum',
'SedentaryMinutes': 'sum'
}).reset_index()
# Create a new column indicating whether the user meets the aerobic activity guidelines
user_aggregated_df['Meets_Aerobic_Guidelines'] = (
(user_aggregated_df['LightlyActiveMinutes'] >= 150) |
((user_aggregated_df['FairlyActiveMinutes'] + user_aggregated_df['VeryActiveMinutes']) >= 75) |
(((user_aggregated_df['LightlyActiveMinutes'] + user_aggregated_df['FairlyActiveMinutes'] + user_aggregated_df['VeryActiveMinutes']) >= 150) &
((user_aggregated_df['FairlyActiveMinutes'] + user_aggregated_df['VeryActiveMinutes']) <= 300))
)
# Create a new column indicating whether the user meets the muscle-strengthening guidelines
user_aggregated_df['Meets_Muscle_Strengthening_Guidelines'] = (
user_aggregated_df['FairlyActiveMinutes'] >= 2
)
# Create a new column indicating whether the user meets the sedentary behavior guidelines
user_aggregated_df['Meets_Sedentary_Behavior_Guidelines'] = (
user_aggregated_df['SedentaryMinutes'] <= 720 # WHO suggests limiting sedentary time, using 720 (12 hours)
)
# Print the count of users meeting each guideline
print(f"Number of users meeting aerobic activity guidelines: {user_aggregated_df['Meets_Aerobic_Guidelines'].sum()}")
print(f"Number of users meeting muscle-strengthening guidelines: {user_aggregated_df['Meets_Muscle_Strengthening_Guidelines'].sum()}")
print(f"Number of users meeting sedentary behavior guidelines: {user_aggregated_df['Meets_Sedentary_Behavior_Guidelines'].sum()}")
# Group by 'Id' and create a new column indicating sleep duration categories
sleepday['SleepDurationCategory'] = pd.cut(
sleepday.groupby('Id')['TotalHoursAsleep'].transform('mean'),
bins=[0, 7, 9, float('inf')],
labels=['Less than 7 hours', '7-9 hours', 'More than 9 hours']
)
# Count the number of unique users in each sleep duration category
unique_sleep_duration_counts = sleepday.groupby('SleepDurationCategory')['Id'].nunique()
# Display the counts
print(unique_sleep_duration_counts)
id_column_name = 'Id'
bmi_column_name = 'BMI'
classification_column_name = 'BMI_Classification'
# Create a new column 'BMI_Classification' to store the classification for each user
weight_log['BMI_Classification'] = pd.cut(
weight_log[bmi_column_name],
bins=[0, 18.5, 24.9, 29.9, 34.9, 39.9, float('inf')],
labels=['Underweight', 'Normal weight', 'Pre-obesity', 'Obesity class I', 'Obesity class II', 'Obesity class III']
)
# Drop duplicate rows based on 'Id' to keep only unique users
unique_users_df = weight_log.drop_duplicates(subset=['Id'])
# Group by 'BMI_Classification' and count the occurrences for each classification
bmi_counts = unique_users_df['BMI_Classification'].value_counts()
# Display the counts for each BMI classification
print(bmi_counts)
# Get the 14 unique users from the heart_rate dataset
unique_users = heart_rate['Id'].unique()[:14]
# Filter heart_rate data for these users
filtered_heart_rate = heart_rate[heart_rate['Id'].isin(unique_users)]
# Calculate the average heart rate for each user
average_heart_rate = filtered_heart_rate.groupby(['Id'])['MinuteAverage'].mean().reset_index()
print(average_heart_rate)
# Distribution of Heart Rate in Minutes for Each User
plt.figure(figsize=(12, 8))
sns.boxplot(data=heart_rate, x='Id', y='MinuteAverage')
plt.title('Distribution of Heart Rate in Minutes for Each User')
plt.xticks(rotation=45, ha='right')
plt.show()
# Correlation Matrix:
# Find the number of unique users in combined_df dataframe
unique_users_count = combined_df['Id'].nunique()
print(f"Number of unique users: {unique_users_count}")
combined_df.columns
# Rename the column 'SedentaryActiveDistance' to 'SedentaryDistance'
combined_df.rename(columns={'SedentaryActiveDistance': 'SedentaryDistance'}, inplace=True)
# Correlation analysis with the updated column name
correlation_matrix = combined_df[['TotalSteps', 'TotalDistance', 'TotalActiveMinutes', 'TotalMinutes',
'LightlyActiveMinutes', 'FairlyActiveMinutes', 'VeryActiveMinutes',
'SedentaryMinutes', 'LightActiveDistance', 'ModeratelyActiveDistance',
'VeryActiveDistance', 'SedentaryDistance', 'Calories',
'TotalHoursAsleep', 'WeightKg', 'BMI', 'MinuteAverage']].corr()
# Set up the matplotlib figure
plt.figure(figsize=(12, 10))
# Customize the heatmap
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, annot_kws={"size": 8})
# Rotate the axis labels for better readability
plt.xticks(rotation=90)
# Add a title
plt.title('Correlation Matrix')
# Display the plot
plt.show()