#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 27 14:08:22 2018

@author: teacher

Fit a simple linear regression of median mid-career wage on median early
career wage for college majors, print the fitted parameters and a few
predictions, and draw the regression line over a scatter plot of the data.

Reads "labor-market-for-recent-college-grads.csv" from the current working
directory.
"""

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf  # import model library

# Clean data as in Lab 14.
# skiprows=13 drops the first 13 lines of the file (presumably preamble above
# the header row -- confirm against the CSV). We also need to skip the final
# two lines of the CSV file because they are empty.
fields = [
    'Major',
    'Unemployment Rate',
    'Underemployment Rate',
    'Median Wage Early Career',
    'Median Wage Mid-Career',
    'Share with Graduate Degree',
]
labor = pd.read_csv(
    "labor-market-for-recent-college-grads.csv",
    skiprows=13,
    skipfooter=2,
    usecols=fields,
    # skipfooter is not supported by the default C parser; pandas would fall
    # back to the Python parser anyway and emit a ParserWarning. Selecting
    # the Python engine explicitly gives the same result without the warning.
    engine="python",
)

# Tell pandas the median wage columns are really numbers by getting rid of
# the "," thousands separator and converting the strings to floats
# (decimal numbers).
labor['Median Wage Early Career'] = (
    labor['Median Wage Early Career'].str.replace(",", "").astype(float)
)
labor['Median Wage Mid-Career'] = (
    labor['Median Wage Mid-Career'].str.replace(",", "").astype(float)
)

# The statsmodels formula interface needs column names without spaces,
# so we rename the columns (same order as `fields`).
labor.columns = [
    'Major',
    'Unemployment',
    'Underemployment',
    'Early',
    'Mid',
    'Share_grad',
]

# Set up the model and fit it (estimate the parameters) using our data.
# For linear models, the formula is y ~ x.
# ols = ordinary least squares regression
lm = smf.ols(formula='Mid ~ Early', data=labor).fit()

# Print out the parameter values
print(lm.params)

# Make some predictions.
# First we make a new DataFrame, consisting of one column called 'Early',
# with the independent variable values.
new_early_wages = pd.DataFrame({'Early': [50000, 55000, 100000]})

# Make the prediction and print. In other words, we are predicting what the
# Median Mid-Career Wage will be if the Median Early Career Wages are
# 50000, 55000, and 100000.
print(lm.predict(new_early_wages))

# Plot regression line on the scatter plot using Seaborn
sns.regplot(x='Early', y='Mid', data=labor)

# regplot only draws onto the current axes; when this file is run as a
# plain script the figure window is never opened without an explicit show().
plt.show()