# Titanic survival prediction with a single decision tree (rpart).
#
# Inputs  (read from the working directory):
#   titanic_train.csv - passengers, including the Survived column
#   titanic_test.csv  - passengers without Survived (what we predict)
# Output (written to the working directory):
#   decisiontree.csv  - PassengerId + predicted Survived, for Kaggle

# Read in the training data (passengers with whether or not they survived).
train <- read.csv("titanic_train.csv", stringsAsFactors = TRUE)

# Read in the test data (passengers without the Survived column,
# which we are trying to predict).
test <- read.csv("titanic_test.csv", stringsAsFactors = TRUE)

# Load the package (R version of a Python module) rpart,
# which has functions (commands) for decision trees.
library(rpart)

# "Fit" the model (the decision tree) to the training data.
# We use the columns Pclass, Sex, Age, SibSp, Parch, Fare and Embarked
# (independent variables) to predict Survived (dependent variable).
# method = "class" requests a classification tree.
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = train,
  method = "class"
)

# Plot the decision tree (the model) with base graphics, then add the
# split/leaf labels on top of it.
plot(fit)
text(fit)

# To plot a fancier picture of the decision tree we need rpart.plot.
# Install it only when it is missing, so re-running the script does not
# re-download the package (or fail when offline) every time.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}

# Load the package into our working space.
library(rpart.plot)

# Make a fancier plot of the decision tree.
prp(fit)

# Use the decision tree to predict who in the test data survived.
# type = "class" returns the predicted class label for each row.
Prediction <- predict(fit, test, type = "class")

# Create a new data frame with only the PassengerId and our prediction.
submit <- data.frame(
  PassengerId = test$PassengerId,
  Survived = Prediction
)

# Write this new data frame to a csv file, for submission to Kaggle.
# row.names = FALSE keeps R's row index out of the file.
write.csv(submit, file = "decisiontree.csv", row.names = FALSE)