# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

# Read both Kaggle-style splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Peek at the first ten training rows (a bare expression: it only renders
# when it is the last statement of a notebook cell).
train.head(10)

# Column names, dtypes and non-null counts
print("Dataset Info:")
train.info()

# describe() is called with default arguments, which summarises the numeric
# columns only.
# NOTE(review): `display` is not imported in this file — presumably the
# IPython/Jupyter builtin; confirm before running as a plain script.
print("\nSummary Statistics:")
summary_stats = train.describe()
display(summary_stats)

# Per-column count of missing entries
print("\nMissing Values:")
missing_counts = train.isna().sum()
display(missing_counts)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

Summary Statistics:

Missing Values:

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Figure 1: a single row of four count plots.
fig, top = plt.subplots(1, 4, figsize=(16, 6))
top = top.flatten()
fig.suptitle("Survival, Class, Gender, Family Size, Age, Fare Distribution", fontsize=16)

# Family size = SibSp + Parch + 1 (the passenger themself); also reused by
# later plotting code.
family_sizes = train['SibSp'] + train['Parch'] + 1

# Each entry: (x values or column name, panel title, x-axis label)
count_panels = [
    ('Survived', 'Survival Count \n(0 = did not survive, 1 = survived)', 'Survived'),
    ('Pclass', 'Passenger Class Distribution', 'Pclass'),
    ('Sex', 'Gender Distribution', 'Gender'),
    (family_sizes, 'Family Size Distribution', 'Family Size'),
]
for axis, (x_values, panel_title, x_label) in zip(top, count_panels):
    sns.countplot(data=train, x=x_values, ax=axis)
    axis.set_title(panel_title)
    axis.set_xlabel(x_label)
    axis.set_ylabel('Count')

# Figure 2: histograms of the two continuous columns, missing values dropped.
fig, bottom = plt.subplots(1, 2, figsize=(16, 6))
bottom = bottom.flatten()

# Each entry: (column, panel title); the column name doubles as the x label.
hist_panels = [
    ('Age', 'Age Distribution'),
    ('Fare', 'Fare Distribution'),
]
for axis, (column, panel_title) in zip(bottom, hist_panels):
    sns.histplot(train[column].dropna(), bins=30, ax=axis)
    axis.set_title(panel_title)
    axis.set_xlabel(column)
    axis.set_ylabel('Frequency')

# Figure 1: count plots split by the Survived flag, one panel per feature.
fig, top = plt.subplots(1, 3, figsize=(16, 6))
top = top.flatten()
fig.suptitle("Survival Rates by Key Features", fontsize=16)

# Each entry: (x values or column name, panel title, x-axis label).
# `family_sizes` is the SibSp + Parch + 1 series built by the earlier
# distribution plots.
survival_count_panels = [
    ('Sex', 'Survival by Gender', 'Sex'),
    ('Pclass', 'Survival by Passenger Class', 'Pclass'),
    (family_sizes, 'Survival by Family Size', 'Family Size'),
]
for axis, (x_values, panel_title, x_label) in zip(top, survival_count_panels):
    sns.countplot(data=train, x=x_values, hue='Survived', ax=axis)
    axis.set_title(panel_title)
    axis.set_xlabel(x_label)
    axis.set_ylabel('Count')

# Figure 2: stacked histograms of the continuous columns, coloured by Survived.
fig, bottom = plt.subplots(1, 2, figsize=(16, 6))
bottom = bottom.flatten()

# Each entry: (column, panel title); the column name doubles as the x label.
survival_hist_panels = [
    ('Age', 'Age Distribution by Survival'),
    ('Fare', 'Fare Distribution by Survival'),
]
for axis, (column, panel_title) in zip(bottom, survival_hist_panels):
    sns.histplot(data=train, x=column, hue='Survived', multiple='stack', bins=30, ax=axis)
    axis.set_title(panel_title)
    axis.set_xlabel(column)
    axis.set_ylabel('Count')

def preprocess(df):
    """Return a model-ready copy of a raw passenger DataFrame.

    The input frame is not modified. On the copy this function:

    * encodes ``Sex`` (male=0, female=1) and ``Embarked`` (S=0, C=1, Q=2)
      as numbers; values outside those maps become NaN,
    * adds ``FamilySize`` = ``SibSp`` + ``Parch`` + 1,
    * fills missing ``Embarked`` with the column's mode and missing ``Fare``
      with the column's median, both computed from ``df`` itself,
    * fills missing ``Age`` via ``predict_age``,
    * drops the ``Name``, ``Ticket`` and ``Cabin`` columns.

    Args:
        df: DataFrame containing at least the columns named above plus
            ``Pclass`` and ``Age``.

    Returns:
        A new DataFrame with the transformations applied.
    """
    sex_codes = {'male': 0, 'female': 1}
    port_codes = {'S': 0, 'C': 1, 'Q': 2}
    unused_columns = ['Name', 'Ticket', 'Cabin']

    cleaned = df.copy()

    # Categorical -> numeric
    cleaned['Sex'] = cleaned['Sex'].map(sex_codes)
    encoded_port = cleaned['Embarked'].map(port_codes)

    # Engineered feature
    cleaned['FamilySize'] = cleaned['SibSp'] + cleaned['Parch'] + 1

    # Missing values: the first entry of mode() is used for Embarked,
    # the median for Fare.
    most_common_port = encoded_port.mode()[0]
    cleaned['Embarked'] = encoded_port.fillna(most_common_port)
    median_fare = cleaned['Fare'].median()
    cleaned['Fare'] = cleaned['Fare'].fillna(median_fare)

    # Age imputation runs last among the fills because it uses the encoded
    # and filled columns above as regression features.
    cleaned = predict_age(cleaned)

    return cleaned.drop(columns=unused_columns)

def predict_age(df):
    """Fill missing ``Age`` values using a linear regression on other columns.

    A ``LinearRegression`` model is fitted on the rows of ``df`` whose ``Age``
    is present, using ``Pclass``, ``Sex``, ``SibSp``, ``Parch``,
    ``FamilySize``, ``Fare`` and ``Embarked`` as features, and its predictions
    replace the missing ``Age`` entries. Rows with a known age are left as-is.

    Args:
        df: DataFrame containing ``Age`` and the seven feature columns. The
            feature columns are expected to be numeric with no missing values
            (``preprocess`` encodes and fills them before calling this).

    Returns:
        A copy of ``df`` with every missing ``Age`` filled; the input frame is
        not modified. If no ``Age`` is missing the copy is returned unchanged.

    Raises:
        ValueError: If ages are missing but no row has a known ``Age`` to fit
            the regression on.
    """
    df = df.copy()

    # Features to use to predict age
    features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'FamilySize', 'Fare', 'Embarked']

    missing_age = df['Age'].isna()

    # Nothing to impute. Returning early also avoids calling predict() on a
    # zero-row frame, which scikit-learn rejects.
    if not missing_age.any():
        return df

    # Split off the rows that can be used for fitting
    known_age = df[~missing_age]
    if known_age.empty:
        raise ValueError(
            "Cannot impute 'Age': no rows with a known age to fit the model on"
        )

    # Train linear regression model
    model = LinearRegression()
    model.fit(known_age[features], known_age['Age'])

    # Predict missing ages. An unconstrained linear fit can extrapolate below
    # zero, so negative predictions are clipped to 0.
    predicted_age = model.predict(df.loc[missing_age, features])
    df.loc[missing_age, 'Age'] = predicted_age.clip(min=0)
    return df

# Run the same cleaning pipeline over both splits.
train_cleaned, test_cleaned = preprocess(train), preprocess(test)

# Preview of the cleaned training rows (renders only as the last expression
# of a notebook cell).
train_cleaned.head(10)

# Target is the Survived column; PassengerId is excluded from the features.
y = train_cleaned["Survived"]
X = train_cleaned.drop(columns=["Survived", "PassengerId"])

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible.
X_train, X_validate, y_train, y_validate = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest with default hyperparameters and a fixed seed
# (fit() returns the estimator itself).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out validation rows.
predictions = model.predict(X_validate)

# Classification metrics on the validation split, reported to four decimals.
accuracy = accuracy_score(y_validate, predictions)
precision = precision_score(y_validate, predictions)
recall = recall_score(y_validate, predictions)
f1 = f1_score(y_validate, predictions)

print(f"\nValidation Accuracy : {accuracy:.4f}")
print(f"Precision           : {precision:.4f}")
print(f"Recall              : {recall:.4f}")
print(f"F1 Score            : {f1:.4f}")

# Confusion matrix of true vs. predicted labels
ConfusionMatrixDisplay.from_predictions(y_validate, predictions, cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

# Pair each training column with the forest's importance score, then rank
# from most to least important.
importance_table = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': model.feature_importances_}
)
feature_importances = importance_table.sort_values('Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importances)

Validation Accuracy : 0.8268
Precision           : 0.7945
Recall              : 0.7838
F1 Score            : 0.7891

Feature Importances:
      Feature  Importance
2         Age    0.272827
1         Sex    0.262662
5        Fare    0.249354
0      Pclass    0.072113
7  FamilySize    0.053230
6    Embarked    0.032754
3       SibSp    0.032521
4       Parch    0.024539

# Keep the ids for the output file; every other cleaned column is a feature.
passenger_ids = test_cleaned["PassengerId"]
X_test = test_cleaned.drop(columns=["PassengerId"])

# Predict a Survived label for each test row with the trained model.
test_predictions = model.predict(X_test)

# Two-column frame: PassengerId and predicted Survived
submission = pd.DataFrame(
    {"PassengerId": passenger_ids, "Survived": test_predictions}
)

# Write without the index so the CSV holds only the two columns above.
submission.to_csv("submission.csv", index=False)

submission.head(5)

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
5	6	0	3	Moran, Mr. James	male	NaN	0	0	330877	8.4583	NaN	Q
6	7	0	1	McCarthy, Mr. Timothy J	male	54.0	0	0	17463	51.8625	E46	S
7	8	0	3	Palsson, Master. Gosta Leonard	male	2.0	3	1	349909	21.0750	NaN	S
8	9	1	3	Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)	female	27.0	0	2	347742	11.1333	NaN	S
9	10	1	2	Nasser, Mrs. Nicholas (Adele Achem)	female	14.0	1	0	237736	30.0708	NaN	C

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked	FamilySize
0	1	0	3	0	22.000000	1	0	7.2500	0.0	2
1	2	1	1	1	38.000000	1	0	71.2833	1.0	2
2	3	1	3	1	26.000000	0	0	7.9250	0.0	1
3	4	1	1	1	35.000000	1	0	53.1000	0.0	2
4	5	0	3	0	35.000000	0	0	8.0500	0.0	1
5	6	0	3	0	27.737912	0	0	8.4583	2.0	1
6	7	0	1	0	54.000000	0	0	51.8625	0.0	1
7	8	0	3	0	2.000000	3	1	21.0750	0.0	5
8	9	1	3	1	27.000000	0	2	11.1333	0.0	3
9	10	1	2	1	14.000000	1	0	30.0708	1.0	2

	PassengerId	Survived
0	892	0
1	893	0
2	894	0
3	895	1
4	896	0

Titanic - Machine Learning from Disaster

Adam Robinson

1. Importing Libraries

2. Loading the data

3. Exploratory Data Analysis

4. Data Preprocessing

5. Model Training

6. Model Evaluation

7. Test Predictions