#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 13 13:16:33 2018

@author: teacher

Demonstrate the Central Limit Theorem with daily pothole request counts.

The script reads pothole service requests from a CSV file, counts the
requests made on each date, plots those daily counts, and then repeatedly
samples from them (with replacement) to compare the spread of the sample
means with sigma/sqrt(n).
"""

import math

import matplotlib.pyplot as plt
import pandas as pd

# Define the data type for two of the columns to get rid of warning message.
dtypes = {'Landmark': 'str', 'Facility Type': 'str'}

# Read in the CSV file, telling Pandas to read the 'Created Date' column as
# datetime types. Because of parse_dates, no separate pd.to_datetime()
# conversion of that column is needed afterwards.
potholes = pd.read_csv(
    "311_Service_Requests_potholes.csv",
    parse_dates=['Created Date'],
    dtype=dtypes,
)

# Get the counts of how many pothole requests were made on each date.
# value_counts() only produces an entry for dates that occur in the data.
pothole_counts = potholes['Created Date'].dt.date.value_counts()

# Plot a line graph of the number of pothole requests each day.
# The option rot=90 rotates the x axis labels by 90 degrees.
pothole_counts.plot(rot=90)
plt.show()

# Plot a histogram of the number of pothole requests each day.
pothole_counts.plot.hist(bins=20)
plt.show()

# Population = pothole counts for each day of the year
# Compute population mean
mu = pothole_counts.mean()
print("Population mean is", mu)

# Compute standard deviation of the population;
# use option ddof=0 to get the population standard deviation.
sigma = pothole_counts.std(ddof=0)

# n = sample size (how many daily counts are drawn for each sample)
n = 10000

# num_samples = how many samples are drawn. Kept separate from n so that
# changing the sample size does not silently change the number of repetitions.
num_samples = 10000

# For each repetition: sample n values from the population with replacement,
# compute the mean of that sample, and collect the means in a list.
sample_means_list = [
    pothole_counts.sample(n, replace=True).mean()
    for _ in range(num_samples)
]

# Convert list to Series
sample_means_series = pd.Series(sample_means_list)

# Plot histogram of the sample means
sample_means_series.plot.hist(bins=40)

# Print the mean of the sample means (series)
print("Mean of sample means is", sample_means_series.mean())

# Print the standard deviation of the sample means
print("Standard deviation of sample means is", sample_means_series.std(ddof=0))

# Value of the standard deviation of the sample means predicted by the
# Central Limit Theorem:
print("sigma/sqrt(n) is", sigma / math.sqrt(n))

# Display the sample-means histogram. Called last so the statistics above are
# printed before a (possibly blocking) plot window opens.
plt.show()