#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 29 13:09:54 2018

@author: teacher
"""

import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# Map each CSV header used in the analysis to a short name without spaces or
# parentheses, so the names can be used directly in statsmodels formulas.
column_names = {
    'Year Built': 'Year',
    'Number Of Stories': 'Stories',
    'Land Area (AC)': 'Land_area',
    'Rentable Building Area': 'Rentable_area',
    'Typical Floor Size': 'Floor_size',
    'Number Of Elevators': 'Elevators',
    'Percent Leased': 'Percent_leased',
}
fields = list(column_names)
ts = pd.read_csv(
    "Times_Square_Property_Data__Commercial_and_Retail_properties_.csv",
    usecols=fields,
)

# drop any row (axis = 0) that is missing some of the data
# to drop a column instead: axis = 1
ts = ts.dropna(axis=0, how='any')

# Rename by header, not by position: read_csv returns the columns in the
# order they appear in the file, regardless of the order given in usecols,
# so assigning a plain list to ts.columns could silently mislabel the data.
# Selecting ts[fields] first also pins the column order to the one above.
ts = ts[fields].rename(columns=column_names)

# Pairwise correlation matrix of all columns in the cleaned data set
corr_matrix = ts.corr()
print(corr_matrix)

# Draw the correlation heatmap on its own figure so that the regression plot
# below does not end up on the same axes.
_, heatmap_ax = plt.subplots()
sns.heatmap(corr_matrix, cmap="PiYG", ax=heatmap_ax)

# Set up linear model with x = Elevators,  y = Stories
lm = smf.ols(formula='Stories ~ Elevators', data=ts).fit()

# Scatter plot with fitted regression line, also on its own figure
_, regplot_ax = plt.subplots()
sns.regplot(x='Elevators', y='Stories', data=ts, ax=regplot_ax)

print("Rsquared (Stories ~ Elevators):", lm.rsquared)

# Set up linear model with x1 = Elevators, x2 = Land_area  y = Stories
lm2 = smf.ols(formula='Stories ~ Elevators + Land_area', data=ts).fit()
print("Rsquared (Stories ~ Elevators + Land_area):", lm2.rsquared)

# Show the figures only after both have been drawn. Previously plt.show() ran
# before sns.regplot(), so the regression plot was created but never
# displayed when the file was run as a script.
plt.show()