#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 12 14:30:24 2018

@author: teacher
"""

import pandas as pd
import numpy as np

# load the NYPD collision records into a dataframe
accidents = pd.read_csv("NYPD_Motor_Vehicle_Collisions.csv")

# -------------------------------------------------------
# STEP 1: estimate the probability that an accident results in an injury or fatality

# boolean masks: True on rows where at least one person was injured / killed
injury_filter = accidents['NUMBER OF PERSONS INJURED'].gt(0)
death_filter = accidents['NUMBER OF PERSONS KILLED'].gt(0)

# an accident counts as "dangerous" when either mask is True
dangerous_accidents = accidents.loc[injury_filter | death_filter]

# how many rows survived the filter
num_dangerous_accidents = dangerous_accidents.shape[0]
print("There were",num_dangerous_accidents,"dangerous accidents")

# how many rows the full dataset has
num_accidents = accidents.shape[0]
print("There were",num_accidents,"accidents total.")

# estimated probability = dangerous accidents / all accidents
prob = num_dangerous_accidents / num_accidents
print("The (estimated) probability an accident is dangerous:",prob )

# -------------------------------------------------------
#  STEP 2: find number of night accidents

# convert the TIME column to datetime so the hour can be read off each row
accidents['TIME'] = pd.to_datetime(accidents['TIME'])

# hour of day of every accident, extracted once and reused below
accident_hour = accidents['TIME'].dt.hour

# filter for night accidents: hour 5 or earlier, or hour 18 or later
night_filter = (accident_hour <= 5) | (accident_hour >= 18)

# reduced dataframe holding only the night accidents
all_night_accidents = accidents.loc[night_filter]

# number of rows in the reduced dataframe
num_night_accidents = all_night_accidents.shape[0]
print("Number of night accidents:",num_night_accidents)

# -------------------------------------------------------
# STEP 3:  sample from the null distribution
#
# Null model: every night accident is dangerous with the probability found in
# step 1, independently of the others.
population = [1,0]      # 1 represents dangerous accident, 0 represents non-dangerous accident
weight = [prob,1-prob]   # use the probability found in step 1

# number of simulated "nights" drawn from the null distribution
num_simulations = 10000

# The number of 1's among num_night_accidents independent draws from
# `population` with probabilities `weight` is Binomial(num_night_accidents, prob).
# Drawing the counts directly avoids building a 0/1 array with one entry per
# night accident on every iteration and adding it up element by element in Python.
simulated_counts = np.random.binomial(num_night_accidents, prob,
                                      size=num_simulations)

# one simulated count of dangerous accidents per entry
count_list = simulated_counts.tolist()

# plot a histogram of the counts of dangerous accidents
count_series = pd.Series(count_list)
count_series.plot.hist(bins=20)

# --------------------------------------------------------
# STEP 4: find number of dangerous night accidents

# a row qualifies when it caused an injury or a death AND happened at night
dangerous_filter = injury_filter | death_filter
dangerous_night_accidents = accidents.loc[dangerous_filter & night_filter]

# number of rows in the reduced dataframe
num_dangerous_night_accidents = dangerous_night_accidents.shape[0]
print("The number of dangerous night accidents in our data set",num_dangerous_night_accidents)