# Titanic "gender model": explore the training data, then predict that every
# female passenger in the test set survived and every male passenger died,
# and write those predictions to a Kaggle submission file.

# Exploration of the training data ----

# Load the training data into a dataframe, with all strings being read in as
# factors (categories for data).
train <- read.csv("titanic_train.csv", stringsAsFactors = TRUE)

# The top line of the output of str() gives the number of rows of data in the
# dataframe.
str(train)

# Make a subset of the data that is only the female passengers on the ship.
women <- subset(train, Sex == "female")

# Display the structure of this dataframe; its row count is the number of
# women in the training data.
str(women)

# An alternative way to get the count for each factor level in the column.
summary(train$Sex)

# Convert the counts into proportions (probabilities).
prop.table(table(train$Sex))

# Display the proportion of passengers having each class of ticket.
prop.table(table(train$Pclass))

# Display the proportion of passengers that have a specific sex and ticket
# class.
prop.table(table(train$Sex, train$Pclass))

# Display the proportion of passengers of each sex and survival outcome
# (1 = survived, 0 = died).
prop.table(table(train$Sex, train$Survived))

# Display the proportion of passengers with each ticket class and survival
# outcome.
prop.table(table(train$Pclass, train$Survived))

# Prediction on the test data ----

# We will now try to predict whether the passengers in the test dataset
# survived or not. Load the test data into a dataframe.
test <- read.csv("titanic_test.csv", stringsAsFactors = TRUE)

# Create a new column in this dataframe for our predictions. To start, we fill
# it with all 0's. The length comes from the data itself rather than a
# hard-coded row count, so the script works for a test file of any size.
test$Survived <- rep(0, nrow(test))

# We will predict all the women survived and all the men died.
# Set the Survived column to 1 if the Sex column of that row is female.
test$Survived[test$Sex == "female"] <- 1

# Submission file ----

# To submit the predictions to Kaggle, we need a csv file with exactly two
# columns: PassengerId and Survived. The columns are named explicitly because
# data.frame() would otherwise derive the names from the argument expressions
# (giving "test.PassengerId" and "test.Survived").
submit <- data.frame(
  PassengerId = test$PassengerId,
  Survived = test$Survived
)

# Write the dataframe to a csv file, without the row-name column.
write.csv(submit, file = "submission.csv", row.names = FALSE)