#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 29 13:09:54 2018

@author: teacher

Load selected columns of the Times Square property CSV, print and plot
their correlation matrix, then fit two ordinary-least-squares models for
the number of stories (one predictor, then two) and print each R-squared.
"""
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# CSV header -> short name without spaces, so the columns can be used
# directly inside statsmodels formula strings.
COLUMN_NAMES = {
    'Year Built': 'Year',
    'Number Of Stories': 'Stories',
    'Land Area (AC)': 'Land_area',
    'Rentable Building Area': 'Rentable_area',
    'Typical Floor Size': 'Floor_size',
    'Number Of Elevators': 'Elevators',
    'Percent Leased': 'Percent_leased',
}
fields = list(COLUMN_NAMES)

ts = pd.read_csv(
    "Times_Square_Property_Data__Commercial_and_Retail_properties_.csv",
    usecols=fields,
)
# read_csv does not honour the order of `usecols`; select explicitly so the
# column order is the one listed in `fields` regardless of the file layout.
ts = ts[fields]

# drop any row (axis = 0) that is missing some of the data
# to drop a column instead: axis = 1
ts = ts.dropna(axis=0, how='any')

# rename columns to get rid of spaces; renaming by name (not by position)
# guarantees each column keeps the label that matches its CSV header
ts = ts.rename(columns=COLUMN_NAMES)

# get correlation matrix
corr_matrix = ts.corr()
print(corr_matrix)
sns.heatmap(corr_matrix, cmap="PiYG")
plt.show()

# Set up linear model with x = Elevators, y = Stories
lm = smf.ols(formula='Stories ~ Elevators', data=ts).fit()

# Draw the regression plot on its own figure and display it; without the
# second show() the plot is never rendered when run as a plain script.
plt.figure()
sns.regplot(x='Elevators', y='Stories', data=ts)
plt.show()
print("Rsquared (Stories ~ Elevators):", lm.rsquared)

# Set up linear model with x1 = Elevators, x2 = Land_area y = Stories
lm2 = smf.ols(formula='Stories ~ Elevators + Land_area', data=ts).fit()
print("Rsquared (Stories ~ Elevators + Land_area):", lm2.rsquared)