# Green taxi trip analysis ----
# Loads the trip records, compares trip distance between solo and
# multi-passenger rides, and adds per-passenger cost and total fare columns.
#
# NOTE: each statement must sit on its own line. R treats everything after a
# `#` up to the end of the line as a comment, so collapsing this script onto
# one line silently disables every statement after the first.

# Load data ----
taxi <- read.csv("Green_Taxi_Trip_Data_Feb2_2016.csv")

# Solo trips ----
# New data frame containing only rows where Passenger_count is 1.
solo_passengers_df <- subset(taxi, Passenger_count == 1)

# Check that the subset worked.
str(solo_passengers_df)
head(solo_passengers_df)

# Multi-passenger trips ----
# New data frame containing only rows where Passenger_count is >= 2.
multi_passenger_df <- subset(taxi, Passenger_count >= 2)

# Check that the subset worked.
head(multi_passenger_df)

# Hypothesis test ----
# Compare mean Trip_distance of the two groups. With its default arguments
# t.test() runs a two-sided Welch test (variances not assumed equal).
t.test(solo_passengers_df$Trip_distance, multi_passenger_df$Trip_distance)

# Derived columns ----
# Mean cost per passenger: Fare_amount divided by Passenger_count in each row.
# Rows with a Passenger_count of 0 would otherwise produce Inf/NaN, so they
# are recorded as NA instead.
taxi$Cost_per_passenger <- ifelse(
  taxi$Passenger_count > 0,
  taxi$Fare_amount / taxi$Passenger_count,
  NA_real_
)

# Total_fare: row-wise sum of Fare_amount, Extra, MTA_tax, Tip_amount and
# Tolls_amount. An NA in any of these columns makes that row's total NA.
taxi$Total_fare <- taxi$Fare_amount +
  taxi$Extra +
  taxi$MTA_tax +
  taxi$Tip_amount +
  taxi$Tolls_amount