#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 24 13:50:48 2018

@author: megan
"""

import pandas as pd
import seaborn as sns   # the Seaborn library has a lot of commands for making beautiful graphs (https://seaborn.pydata.org)
import matplotlib.pyplot as plt

# Dataset downloaded from: https://www.newyorkfed.org/research/college-labor-market/college-labor-market_compare-majors.html
# To download: At bottom of page, click the link that says "Download Data EXCEL".  Open downloaded file in Excel and save as a CSV file.

# The CSV file contains many more columns than this analysis needs, so rather
# than dropping the extras afterwards we name the columns to keep up front.
fields = [
    'Major',
    'Unemployment Rate',
    'Underemployment Rate',
    'Median Wage Early Career',
    'Median Wage Mid-Career',
    'Share with Graduate Degree',
]

# Load the CSV, ignoring the first 13 rows of the file and keeping only the
# columns named in fields.
labor = pd.read_csv(
    "labor-market-for-recent-college-grads.csv",
    skiprows=13,
    usecols=fields,
)

# The two median wage columns are handled as strings here (the numbers contain
# commas), so convert each one to real numbers:
#   i)  delete every comma from the text ("" is the empty string), then
#   ii) cast the cleaned-up text to float (decimal number).
for wage_column in ('Median Wage Early Career', 'Median Wage Mid-Career'):
    labor[wage_column] = labor[wage_column].str.replace(",", "").astype(float)

# Compute the correlation matrix of all numerical columns in the labor dataframe.
# Each entry in the matrix is the correlation between two of the columns in the dataframe labor.
# The dataframe also holds the text column "Major"; newer pandas versions raise an
# error if corr() is asked to correlate text, so we explicitly keep only the
# numerical columns before calling corr().
correlation_matrix = labor.select_dtypes(include="number").corr()
# Print the correlation matrix to the screen
print(correlation_matrix)

# Displays the correlation matrix as a heatmap, in which each number is replaced by a color.
# This heatmap makes it easier to visualize the different correlations.
sns.heatmap(correlation_matrix)
plt.show()

# Make a scatter plot where the x values are from the "Median Wage Early Career" column and the
# y values are from the "Median Wage Mid-Career" column.
# Each point in the scatter plot represents one of the majors (with the x and y values corresponding to that major)
labor.plot.scatter(x="Median Wage Early Career", y="Median Wage Mid-Career")
# Show the figure, just like the heatmap above; without this call the scatter
# plot may never appear when the file is run as a plain script.
plt.show()