-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path# - - coding utf-8 - -.py
115 lines (84 loc) · 3.71 KB
/
# - - coding utf-8 - -.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# -*- coding: utf-8 -*-
"""data analysis.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1CJLUdeTEnpqWA1-UQhC9329AFi08GrBs
#Step 1: Loading the Dataset and Initial Inspection
"""
import pandas as pd
# Load the Titanic dataset into a Pandas DataFrame.
# The CSV is fetched over the network from the pandas documentation data.
url = 'https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv'
titanic_data = pd.read_csv(url)
# Display basic information about the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
titanic_data.info()
print(titanic_data.head())
"""#Step 2: Preprocessing the Data"""
# Handling missing values
# Report the number of missing values per column before cleaning. The result
# is printed explicitly: in a script a bare expression is silently discarded.
print(titanic_data.isnull().sum())
# Fill missing values in 'Age' column with the median age.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form
# triggers a FutureWarning on pandas 2.x and leaves the DataFrame unchanged
# when Copy-on-Write is active.
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())
# Drop 'Cabin' column due to many missing values
titanic_data.drop('Cabin', axis=1, inplace=True)
# Handling categorical variables like 'Sex' and 'Embarked'
# Encode 'Sex' as integers: male -> 0, female -> 1. Any other value becomes NaN.
titanic_data['Sex'] = titanic_data['Sex'].map({'male': 0, 'female': 1})
# Missing ports are filled with 'S' before encoding
# NOTE(review): 'S' is presumably the most frequent port - confirm.
titanic_data['Embarked'] = titanic_data['Embarked'].fillna('S')
# Encode 'Embarked' as integers: S -> 0, C -> 1, Q -> 2.
titanic_data['Embarked'] = titanic_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
# Verify changes
print(titanic_data.isnull().sum())  # Check for missing values again
print(titanic_data.head())
"""#Step 3: Exploratory Data Analysis (EDA) and Visualization"""
import matplotlib.pyplot as plt
import seaborn as sns
# Summary statistics
print(titanic_data.describe())
# Visualization: Survived vs. Not Survived
sns.countplot(x='Survived', data=titanic_data)
plt.title('Survival Count (0 = Not Survived, 1 = Survived)')
plt.show()
# Visualization: Survival by Gender
# The legend is rebuilt from the handles seaborn registered for each hue
# level, paired explicitly with the readable labels. Passing only labels to
# legend() matches them to artists implicitly by order, which matplotlib
# discourages because the pairing can easily be mixed up.
gender_ax = sns.countplot(x='Survived', hue='Sex', data=titanic_data)
plt.title('Survival Count by Gender')
gender_handles = gender_ax.get_legend_handles_labels()[0]
# Label order follows the ascending 'Sex' codes 0 and 1.
gender_ax.legend(gender_handles, ['Male', 'Female'])
plt.show()
# Visualization: Survival by Passenger Class
class_ax = sns.countplot(x='Survived', hue='Pclass', data=titanic_data)
plt.title('Survival Count by Passenger Class')
class_handles = class_ax.get_legend_handles_labels()[0]
# Label order follows the ascending 'Pclass' values 1, 2, 3.
class_ax.legend(class_handles, ['1st Class', '2nd Class', '3rd Class'])
plt.show()
# Visualization: Age distribution (20-bin histogram with a KDE overlay)
sns.histplot(titanic_data['Age'], bins=20, kde=True)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()
"""#Step 4: Mapping Questions to Generate Answers
##Question 1: What is the survival rate based on gender?
"""
# Survival rate per gender: the mean of 'Survived' within each 'Sex' value.
survival_rate_gender = titanic_data['Survived'].groupby(titanic_data['Sex']).mean()
print("Survival Rate by Gender:", survival_rate_gender, sep="\n")
"""##Question 2: Did passenger class affect survival chances?"""
# Survival rate per passenger class: the mean of 'Survived' within each
# 'Pclass' value.
survival_rate_class = titanic_data['Survived'].groupby(titanic_data['Pclass']).mean()
print("\nSurvival Rate by Passenger Class:", survival_rate_class, sep="\n")
"""##Question 3: Is there a relationship between age and survival?"""
# Calculate survival rate by age group.
# pd.cut with its default right-closed bins yields the intervals
# (0, 18], (18, 30], (30, 50] and (50, 80]; ages outside them become NaN and
# are left out of the grouped result.
titanic_data['Age_Group'] = pd.cut(titanic_data['Age'], bins=[0, 18, 30, 50, 80])
# 'Age_Group' is categorical, so observed=False is passed explicitly: it keeps
# every bin in the result and avoids the pandas FutureWarning about the
# changing default for categorical groupers.
survival_rate_age = titanic_data.groupby('Age_Group', observed=False)['Survived'].mean()
print("\nSurvival Rate by Age Group:")
print(survival_rate_age)
"""##Question 4: How did the fare paid correlate with survival?"""
# Calculate survival rate by fare.
# pd.qcut with q=4 splits 'Fare' into four quantile-based (quartile) bins.
titanic_data['Fare_Group'] = pd.qcut(titanic_data['Fare'], q=4)
# Same explicit observed=False as above, since 'Fare_Group' is categorical too.
survival_rate_fare = titanic_data.groupby('Fare_Group', observed=False)['Survived'].mean()
print("\nSurvival Rate by Fare Group:")
print(survival_rate_fare)
"""##Question 5: Did having family aboard influence survival rates?"""
# Calculate survival rate by family presence.
# 'Family' is a 0/1 flag: 1 when SibSp + Parch is greater than zero, else 0.
# The flag is computed with a vectorized comparison instead of a row-wise
# apply(lambda ...), and the column is written once rather than being
# overwritten after first holding the raw sum.
titanic_data['Family'] = (
    (titanic_data['SibSp'] + titanic_data['Parch']) > 0
).astype('int64')
survival_rate_family = titanic_data.groupby('Family')['Survived'].mean()
print("\nSurvival Rate by Family Presence:")
print(survival_rate_family)