Predicting survivors on the Titanic using machine learning
Case description
Problem – Predicting survivors on the Titanic using machine learning.
This case study is part of the Kaggle competition. You can submit your predictions on Kaggle.
This data is also available at https://www.kaggle.com/c/titanic/data
Data / Variables description
| Variable | Description | Category | 
| survival | Survival | 0 = No, 1 = Yes | 
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd | 
| sex | Sex | |
| Age | Age in years | |
| sibsp | # of siblings / spouses aboard the Titanic | |
| parch | # of parents / children aboard the Titanic | |
| ticket | Ticket number | |
| fare | Passenger fare | |
| cabin | Cabin number | |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton | 
Download dataset to practice
Solution with R codes
1. Download and save the above dataset (Train and Test) on your desktop.
2. Create a project in RStudio
If you are new to RStudio, learn how to create a project in RStudio
https://lightgray-cattle-549464.hostingersite.com/post/how-to-create-a-project-in-r-studio
3. Link the saved dataset and set up the working directory
R codes
Data Exploration
# Load the raw training and test data from CSV files in the working directory
train <- read.csv(file = "train.csv", header = TRUE)
test <- read.csv(file = "test.csv", header = TRUE)

# Print both data sets to the console
print(train)
print(test)

# Open both data sets in the data viewer
View(train)
View(test)

# Show the first rows of each data set
head(train)
head(test)

# Explore the data types of the columns in each data set
str(train)
str(test)
# Missing value analysis

# is.na() returns TRUE where a value is missing and FALSE otherwise
is.na(train$Pclass)

# Count the flags, then store them in a data frame for inspection
table(is.na(train$Pclass))
missing_Pclass <- data.frame(is.na(train$Pclass))
View(missing_Pclass)
table(missing_Pclass) # no missing values

# Do this for the other training variables as well
missing_train_Sex <- data.frame(is.na(train$Sex))
missing_train_Age <- data.frame(is.na(train$Age))
missing_train_Sibsp <- data.frame(is.na(train$SibSp))
missing_train_Parch <- data.frame(is.na(train$Parch))
missing_train_Ticket <- data.frame(is.na(train$Ticket))
missing_train_Fare <- data.frame(is.na(train$Fare))
missing_train_Cabin <- data.frame(is.na(train$Cabin))
missing_train_Embarked <- data.frame(is.na(train$Embarked))

# Tabulate missing values (training data)
table(missing_train_Sex)
table(missing_train_Age)
table(missing_train_Sibsp)
table(missing_train_Parch)
table(missing_train_Ticket)
table(missing_train_Fare)
table(missing_train_Cabin)
table(missing_train_Embarked)

# Look at the raw Embarked values and their frequencies
table(train$Embarked)
train$Embarked

# For test data
missing_test_Sex <- data.frame(is.na(test$Sex))
missing_test_Age <- data.frame(is.na(test$Age))
missing_test_Sibsp <- data.frame(is.na(test$SibSp))
missing_test_Parch <- data.frame(is.na(test$Parch))
missing_test_Ticket <- data.frame(is.na(test$Ticket))
missing_test_Fare <- data.frame(is.na(test$Fare))
missing_test_Cabin <- data.frame(is.na(test$Cabin))
missing_test_Embarked <- data.frame(is.na(test$Embarked))

# Tabulate missing values (test data)
table(missing_test_Sex)
table(missing_test_Age)
table(missing_test_Sibsp)
table(missing_test_Parch)
table(missing_test_Ticket)
table(missing_test_Fare)
table(missing_test_Cabin)
table(missing_test_Embarked)

# The same counts for the training data without the intermediate data frames
table(is.na(train$Pclass))
table(is.na(train$Sex))
table(is.na(train$Age))
table(is.na(train$SibSp))
table(is.na(train$Parch))
table(is.na(train$Ticket))
table(is.na(train$Fare))
table(is.na(train$Cabin))
table(is.na(train$Embarked))

# read.csv() with its default na.strings keeps blank text fields as "" rather
# than NA, so is.na() alone cannot flag them. Count the entries that are
# either NA or an empty string in every column.
count_blank_or_na <- function(df) {
  vapply(
    df,
    function(column) sum(is.na(column) | as.character(column) == ""),
    integer(1)
  )
}
count_blank_or_na(train)
count_blank_or_na(test)
Exploratory data analysis
 
Frequency tables and descriptive statistics
# Convert the categorical variables to factors and show their frequencies
# (training data)
train$Pclass <- as.factor(train$Pclass)
table(train$Pclass)
train$Sex <- as.factor(train$Sex)
table(train$Sex)
train$SibSp <- as.factor(train$SibSp)
table(train$SibSp)
train$Parch <- as.factor(train$Parch)
table(train$Parch)
train$Embarked <- as.factor(train$Embarked)
table(train$Embarked)
train$Survived <- as.factor(train$Survived)
table(train$Survived)

# Same conversion for the test data
test$Pclass <- as.factor(test$Pclass)
table(test$Pclass)
test$Sex <- as.factor(test$Sex)
table(test$Sex)
test$SibSp <- as.factor(test$SibSp)
table(test$SibSp)
test$Parch <- as.factor(test$Parch)
table(test$Parch)
test$Embarked <- as.factor(test$Embarked)
table(test$Embarked)

# Descriptive statistics for the continuous variable (train data)
summary(train$Age)
sd(train$Age, na.rm = TRUE)
mean(train$Age, na.rm = TRUE)

# Descriptive statistics for the continuous variable (test data)
summary(test$Age)
sd(test$Age, na.rm = TRUE)
mean(test$Age, na.rm = TRUE)

# Replace the missing ages with the mean age of the same data set
train$Age[is.na(train$Age)] <- mean(train$Age, na.rm = TRUE)
test$Age[is.na(test$Age)] <- mean(test$Age, na.rm = TRUE)

# Add an empty Survived column to the test data for prediction.
# It takes its levels from train$Survived. Assigning a differently ordered
# level vector to train$Survived with `levels<-` would rename the existing
# labels (every "0" would become "1" and vice versa), so the training
# labels are deliberately left untouched.
test$Survived <- factor(rep(NA, nrow(test)), levels = levels(train$Survived))
levels(test$Survived)
levels(train$Survived)

# Survival counts in absolute numbers
table(train$Survived)

# As proportions
prop.table(table(train$Survived))
Bivariate relationships
# Two-way comparison: Sex (rows) and Survived (columns)
table(train$Sex, train$Survived)

# Two-way comparison: row-wise proportions.
# margin = 1 makes each row (each sex) sum to 1; without it prop.table()
# divides by the grand total and returns whole-table proportions.
prop.table(table(train$Sex, train$Survived), margin = 1)
Make prediction using Zero classification model
Either everyone survives or everyone dies
# Prediction 1 - everyone dies
everyone_dies <- test
everyone_dies$Survived <- 0
View(everyone_dies)

# Write a csv file with the PassengerId and our Survived predictions,
# ready to submit to Kaggle
submit <- data.frame(
  PassengerId = everyone_dies$PassengerId,
  Survived = everyone_dies$Survived
)
write.csv(submit, file = "everyone_dies.csv", row.names = FALSE)

# Prediction 2 - everyone survives
everyonesurvives <- test
everyonesurvives$Survived <- 1
View(everyonesurvives)

# Write a csv file with the PassengerId and our Survived predictions,
# ready to submit to Kaggle
submit <- data.frame(
  PassengerId = everyonesurvives$PassengerId,
  Survived = everyonesurvives$Survived
)
write.csv(submit, file = "everyone_survives.csv", row.names = FALSE)

# Prediction 3 - all females survived / was the women-and-children-first
# rule followed?
# Let us try to understand the relationship between survival and gender by
# doing a cross tabulation.

# Install gmodels only when it is not already available, so that re-running
# the script does not reinstall the package every time
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels")
}
library(gmodels)
CrossTable(train$Survived, train$Sex)

Allfemalesurvived <- test

# Initialize a Survived column to 0
Allfemalesurvived$Survived <- 0

# Set Survived to 1 if Sex equals "female"
Allfemalesurvived$Survived[Allfemalesurvived$Sex == "female"] <- 1

# Export the predictions to a .csv file
submit <- data.frame(
  PassengerId = Allfemalesurvived$PassengerId,
  Survived = Allfemalesurvived$Survived
)
write.csv(submit, file = "femalessurvived.csv", row.names = FALSE)
Supervised Machine learning Algorithms for predictive modeling.
Logistic regression
# Train the model using the training sets and check score

# Logistic regression of Survived on passenger class, sex, port of
# embarkation and age
logistic <- glm(
  formula = Survived ~ Pclass + Sex + Embarked + Age,
  data = train,
  family = "binomial"
)

# Coefficient table and fit statistics of the model
summary(logistic)
# Let us make predictions using the logistic model.
# type = "response" returns the predictions on the probability scale.
my_log_prediction <- predict(logistic, test, type = "response")

# Suppose you wish to export the predicted probabilities
submit <- data.frame(
  PassengerId = test$PassengerId,
  Survived = my_log_prediction
)
write.csv(submit, file = "logistic_prob.csv", row.names = FALSE)

# Suppose you wish to export the classification: probabilities above 0.5
# are labelled "1", all others "0". The threshold is applied to
# my_log_prediction, the vector computed at the top of this block.
predicted.classes <- ifelse(my_log_prediction > 0.5, "1", "0")
table(predicted.classes)
submit <- data.frame(
  PassengerId = test$PassengerId,
  Survived = predicted.classes
)
write.csv(submit, file = "logistic_class.csv", row.names = FALSE)

# Export probabilities and classifications side by side
submit <- data.frame(
  PassengerId = test$PassengerId,
  Survived = my_log_prediction,
  Classification = predicted.classes
)
write.csv(submit, file = "logistic_class_prob.csv", row.names = FALSE)
Decision Trees
# CART decision tree algorithm
library(rpart)

# Install the plotting packages only when they are missing, so that
# re-running the script does not reinstall them every time
for (pkg in c("rattle", "rpart.plot", "RColorBrewer")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(rattle)
library(rpart.plot)

# fit is our decision tree model: a classification tree (method = "class")
# for Survived
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Embarked,
  data = train,
  method = "class"
)

# Let us visualize the tree: first the basic plot with text labels,
# then the fancy plot
plot(fit)
text(fit)
fancyRpartPlot(fit)
# Let us make the prediction using the decision tree model:
# one predicted class label per row of the test data
Prediction <- predict(fit, newdata = test, type = "class")

# Pair each PassengerId with its predicted label and write the result to csv
submit <- data.frame(
  PassengerId = test$PassengerId,
  Survived = Prediction
)
write.csv(submit, file = "myfirstdtree.csv", row.names = FALSE)