#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 12 14:30:24 2018

@author: teacher

Compare night-time collisions with the overall collision data set.

The script
  1. estimates the probability that an accident injures or kills someone,
  2. counts the accidents that happened at night,
  3. simulates, under that overall probability, how many of that many
     accidents would be dangerous (the null distribution) and plots a
     histogram of the simulated counts,
  4. prints the number of dangerous night accidents actually observed.
"""

import pandas as pd
import numpy as np

# Input file and analysis parameters.
CSV_PATH = "NYPD_Motor_Vehicle_Collisions.csv"
NIGHT_START_HOUR = 18  # night starts at 18:00 ...
NIGHT_END_HOUR = 5     # ... and lasts through 05:59
NUM_TRIALS = 10000     # number of samples drawn from the null distribution


def night_mask(times, start_hour=NIGHT_START_HOUR, end_hour=NIGHT_END_HOUR):
    """Return a boolean Series that is True where a timestamp is at night.

    Parameters
    ----------
    times : pandas.Series
        Series of datetime values (it must support the ``.dt`` accessor).
    start_hour : int
        First hour of the evening that counts as night (inclusive).
    end_hour : int
        Last hour of the morning that counts as night (inclusive).

    Returns
    -------
    pandas.Series
        True where ``hour >= start_hour`` or ``hour <= end_hour``.
    """
    hours = times.dt.hour
    # Night wraps around midnight, so the two ranges are OR-ed together.
    return (hours >= start_hour) | (hours <= end_hour)


def simulate_dangerous_counts(sample_size, prob_dangerous,
                              num_trials=NUM_TRIALS):
    """Simulate how many of ``sample_size`` accidents are dangerous.

    Each simulated accident is dangerous with probability
    ``prob_dangerous``, independently of the others.  Drawing
    ``sample_size`` such 0/1 values and adding them up is exactly one
    binomial draw, so all trials are generated with a single vectorised
    call instead of building and summing a large 0/1 sample per trial.

    Parameters
    ----------
    sample_size : int
        Number of accidents in each simulated sample.
    prob_dangerous : float
        Probability (between 0 and 1) that a single accident is dangerous.
    num_trials : int
        Number of simulated samples.

    Returns
    -------
    numpy.ndarray
        ``num_trials`` integer counts, each between 0 and ``sample_size``.
    """
    return np.random.binomial(sample_size, prob_dangerous, size=num_trials)


if __name__ == "__main__":
    # read in the csv file
    accidents = pd.read_csv(CSV_PATH)

    # -------------------------------------------------------
    # STEP 1: estimate the probability that an accident results in an
    #         injury or fatality

    # set up the filters
    injury_filter = accidents['NUMBER OF PERSONS INJURED'] > 0
    death_filter = accidents['NUMBER OF PERSONS KILLED'] > 0
    # an accident is "dangerous" when it caused an injury or a death;
    # computed once here and reused in step 4
    dangerous_filter = injury_filter | death_filter

    # keep only those accidents causing injury or death
    dangerous_accidents = accidents[dangerous_filter]

    # compute the number of accidents causing injury or death
    num_dangerous_accidents = len(dangerous_accidents)
    print("There were", num_dangerous_accidents, "dangerous accidents")

    # compute the total number of accidents in the dataset
    num_accidents = len(accidents)
    print("There were", num_accidents, "accidents total.")

    # probability an accident causes injury or death =
    #   # of accidents causing injury or death / # of accidents
    prob = num_dangerous_accidents / num_accidents
    print("The (estimated) probability an accident is dangerous:", prob)

    # -------------------------------------------------------
    # STEP 2: find number of night accidents

    # set the type of the TIME column to datetime
    accidents['TIME'] = pd.to_datetime(accidents['TIME'])

    # set up filter to find night accidents
    night_filter = night_mask(accidents['TIME'])

    # apply the filter to get a reduced dataframe of only night accidents
    all_night_accidents = accidents[night_filter]

    # count the number of nighttime accidents
    num_night_accidents = len(all_night_accidents)
    print("Number of night accidents:", num_night_accidents)

    # -------------------------------------------------------
    # STEP 3: sample from the null distribution

    # Each trial is a sample as large as the number of night accidents in
    # which every accident is dangerous with the probability from step 1;
    # the value kept for each trial is its number of dangerous accidents.
    count_list = list(
        simulate_dangerous_counts(num_night_accidents, prob, NUM_TRIALS)
    )

    # plot a histogram of the counts of dangerous accidents
    count_series = pd.Series(count_list)
    count_series.plot.hist(bins=20)

    # --------------------------------------------------------
    # STEP 4: find number of dangerous night accidents

    # use both the night and the injury-or-death filters
    dangerous_night_accidents = accidents[night_filter & dangerous_filter]

    # get the size of our reduced dataframe
    num_dangerous_night_accidents = len(dangerous_night_accidents)
    print("The number of dangerous night accidents in our data set",
          num_dangerous_night_accidents)