#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Explore the New York Fed "labor market for recent college graduates" data.

Created on Sat Mar 24 13:50:48 2018

@author: megan

The script reads a handful of columns from the CSV export, prints the
correlation matrix of the numeric columns, shows that matrix as a heatmap,
and finally draws a scatter plot of early-career vs. mid-career median wage.
"""

import pandas as pd
# The Seaborn library has a lot of commands for making beautiful graphs
# (https://seaborn.pydata.org).
import seaborn as sns
import matplotlib.pyplot as plt

# Dataset downloaded from:
# https://www.newyorkfed.org/research/college-labor-market/college-labor-market_compare-majors.html
# To download: at the bottom of the page, click the link that says
# "Download Data EXCEL". Open the downloaded file in Excel and save it as a
# CSV file.

# The CSV file has a lot of extra columns. Instead of getting rid of these
# columns, we only read in the columns we want.
# `fields` is a list with the names of the columns we want.
fields = [
    'Major',
    'Unemployment Rate',
    'Underemployment Rate',
    'Median Wage Early Career',
    'Median Wage Mid-Career',
    'Share with Graduate Degree',
]

# Read in the CSV file, skipping the first 13 rows and only using the
# columns listed in `fields`.
labor = pd.read_csv(
    "labor-market-for-recent-college-grads.csv",
    skiprows=13,
    usecols=fields,
)

# Tell pandas that the median wage columns are really numbers, not strings.
# We do this by:
#   i)  replacing the comma in the number with nothing ("" is the empty
#       string), and
#   ii) converting the resulting string to a float (decimal number).
for wage_column in ('Median Wage Early Career', 'Median Wage Mid-Career'):
    labor[wage_column] = (
        labor[wage_column].str.replace(",", "").astype(float)
    )

# Compute the correlation matrix of all numerical columns in the labor
# dataframe. Each entry in the matrix is the correlation between two of the
# columns of `labor`.
# The numeric columns are selected explicitly because 'Major' holds text:
# pandas 2.x raises on non-numeric columns in DataFrame.corr() instead of
# silently skipping them, while older versions give the same result either
# way.
correlation_matrix = labor.select_dtypes(include="number").corr()

# Print the correlation matrix to the screen.
print(correlation_matrix)

# Display the correlation matrix as a heatmap, in which each number is
# replaced by a color. This makes it easier to visualize the different
# correlations.
sns.heatmap(correlation_matrix)
plt.show()

# Make a scatter plot where the x values are from the
# "Median Wage Early Career" column and the y values are from the
# "Median Wage Mid-Career" column. Each point represents one of the majors
# (with the x and y values corresponding to that major).
labor.plot.scatter(x="Median Wage Early Career", y="Median Wage Mid-Career")
# Without this call the scatter plot is never displayed when the file is run
# as a plain script (the heatmap above already gets its own plt.show()).
plt.show()