# Import plotting modules
import matplotlib.pyplot as plt
import seaborn as sns
# Set default Seaborn style
sns.set()
from sklearn.datasets import load_iris
# Load the iris dataset that ships with scikit-learn.
data = load_iris()
# Notebook-style inspections: a bare expression only displays its value in an
# interactive session and has no effect when the file is run as a script.
type(data)
data.target_names
data.data.shape
data.target
# Keep only the rows labelled 1 (the label this file later maps to
# 'versicolor').
versicolor = data.data[data.target == 1]
versicolor.shape
data.feature_names
# Column 2 is used as the petal length -- compare with data.feature_names.
versicolor_petal_lengths = versicolor[:, 2]
# print() with a single argument behaves the same on Python 2 and Python 3,
# unlike the Python 2-only print statement.
print(versicolor_petal_lengths.shape)
versicolor_petal_lengths
# Plot a histogram of the petal lengths of Justin's 50 samples of Iris versicolor using matplotlib/seaborn's default settings. Recall that to specify the default seaborn style, you can use sns.set(), where sns is the alias that seaborn is imported as.
# The subset of the data set containing the Iris versicolor petal lengths in units of centimeters (cm) is stored in the NumPy array versicolor_petal_lengths.
# Also, Justin assigned his plotting statements (except for plt.show()) to the dummy variable _. This is to prevent unnecessary output from being displayed.
# Import numpy
import numpy as np
# Histogram of the versicolor petal lengths using the square-root rule for
# the number of bins.
# Compute number of data points: n_data
n_data = len(versicolor_petal_lengths)
# print() with a single argument works on both Python 2 and Python 3.
print(n_data)
# Number of bins is the square root of number of data points: n_bins
n_bins = np.sqrt(n_data)
print(n_bins)
# Convert number of bins to integer: n_bins (int() drops the fractional part)
n_bins = int(n_bins)
print(n_bins)
# Plot histogram of versicolor petal lengths; ec='black' outlines each bar
_ = plt.hist(versicolor_petal_lengths, bins=n_bins, ec='black')
# Label axes
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('count')
# Show histogram
plt.show()
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
# Load the scikit-learn iris dataset into `iris`.
# Handy while exploring: type(load_iris()) shows the container type and
# dir(load_iris()) lists its attributes.
iris = load_iris()
# Build one table holding the features plus the numeric label:
#  * np.c_ glues iris['target'] onto iris['data'] as an extra column;
#  * the column names are the feature names followed by 'target'. The label
#    column could be called anything -- the original dataset would probably
#    call it 'Species'.
column_names = iris['feature_names'] + ['target']
features_and_target = np.c_[iris['data'], iris['target']]
df = pd.DataFrame(data=features_and_target, columns=column_names)
df.shape
# Two small demos of turning 1-D sequences into the columns of a 2-D array.
np.c_[np.array([1,2,3]), np.array([4,5,6])]
np.stack([[1,2,3],[4,5,6]], 1)
df.head()
iris.target_names
# Translate the numeric labels into readable species names.
species_by_label = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df['species'] = df['target'].map(species_by_label)
df.head()
# Make a bee swarm plot of the iris petal lengths. The x-axis should contain each of the three species, and the y-axis the petal lengths.
# Bee swarm plot (Seaborn defaults): one column of points per species,
# petal length on the vertical axis.
swarm_ax = sns.swarmplot(data=df, x='species', y='petal length (cm)')
# Axis labels, set on the Axes that swarmplot drew on
_ = swarm_ax.set_xlabel('species')
_ = swarm_ax.set_ylabel('petal length (cm)')
# Render the figure
plt.show()
# Define a function that takes as input a 1D array of data and returns the x and y values of the ECDF (empirical cumulative distribution function).
# Reference: https://en.wikipedia.org/wiki/Empirical_distribution_function
def ecdf(data):
    """Compute ECDF for a one-dimensional array of measurements.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of measurements; anything accepted by
        both ``len()`` and ``np.sort()``.

    Returns
    -------
    x : numpy.ndarray
        The measurements sorted in ascending order.
    y : numpy.ndarray
        ECDF value for each entry of ``x``: ``i / n`` for ``i = 1..n``, so
        the last value is 1.0.
    """
    # Number of data points: n
    n = len(data)
    # x-data for the ECDF: x
    x = np.sort(data)
    # y-data for the ECDF: y (float(n) forces true division on Python 2)
    y = np.arange(1, n + 1) / float(n)
    return x, y
# Quick checks of ecdf on a tiny list and on the versicolor measurements
ecdf([1,2,3,4])
ecdf(versicolor_petal_lengths)
# ECDF of the versicolor petal lengths: x_vers, y_vers
x_vers, y_vers = ecdf(versicolor_petal_lengths)
# Draw the ECDF as individual points with no connecting line
ecdf_point_style = dict(marker='.', linestyle='none')
_ = plt.plot(x_vers, y_vers, **ecdf_point_style)
# A small margin keeps the points off the plot edges
plt.margins(0.02)
# Axis labels
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('ECDF')
# Render the figure
plt.show()
df.head()
# Compare the petal-length ECDFs of the three species on one figure.
plt.figure(figsize=(15, 5))
# Every species must read the same 'petal length (cm)' column. Selecting the
# columns by position (iloc 0, 1 and 2) would give each species a different
# feature, which contradicts the x-axis label below.
petal_length = df['petal length (cm)']
# Compute ECDFs (targets 0, 1, 2 are setosa, versicolor, virginica)
x_set, y_set = ecdf(petal_length[df['target'] == 0])
x_vers, y_vers = ecdf(petal_length[df['target'] == 1])
x_virg, y_virg = ecdf(petal_length[df['target'] == 2])
# Plot all ECDFs on the same plot
_ = plt.plot(x_set, y_set, marker='.', linestyle='none')
_ = plt.plot(x_vers, y_vers, marker='.', linestyle='none')
_ = plt.plot(x_virg, y_virg, marker='.', linestyle='none')
# Make nice margins
plt.margins(0.02)
# Annotate the plot; legend entries follow the order of the plot calls above
_ = plt.legend(('setosa', 'versicolor', 'virginica'), loc='lower right')
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('ECDF')
# Display the plot
plt.show()
# Load the 2008 election results; note that this rebinds `df`, replacing the
# iris table used above.
df = pd.read_csv('./2008_all_states.csv')
# Notebook-style inspections of the loaded table
df.shape
df.tail()
df['state'].value_counts().head()
df.pivot_table(index='east_west',values='total_votes', aggfunc='mean')
df.pivot_table(index='state',values='total_votes', aggfunc='mean').tail()
df.head()
# Keep only the rows belonging to the three states of interest
swing_states = ['PA', 'OH', 'IL']
in_swing_state = df['state'].isin(swing_states)
swing = df[in_swing_state]
swing.shape
# Second name for the same selection; both names are used further down
df_swing = swing
# Histogram of the Democratic vote share across the selected rows
dem_share = df_swing['dem_share']
_ = plt.hist(dem_share)
_ = plt.xlabel('percent of vote for Obama')
_ = plt.ylabel('number of counties')
plt.show()
# Bee swarm plot of the same vote share, one column of points per state
swarm_ax = sns.swarmplot(data=df_swing, x='state', y='dem_share')
_ = swarm_ax.set_xlabel('state')
_ = swarm_ax.set_ylabel('percent of vote for Obama')
plt.show()
import numpy as np
# ECDF of the vote share over all selected rows, computed inline
plt.figure(figsize=[16,5])
# x: the sorted values; y: the fractions 1/n, 2/n, ..., 1
x = np.sort(df_swing['dem_share'])
n_points = len(x)
y = np.arange(1, n_points + 1) / float(n_points)
# Points only, no connecting line
_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.xlabel('percent of vote for Obama')
_ = plt.ylabel('ECDF')
plt.margins(0.02) # Keeps data off plot edges
plt.show()
# List the distinct states present in the selection
swing['state'].unique()
def ecdf(data):
    """Compute ECDF for a one-dimensional array of measurements.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of measurements; anything accepted by
        both ``len()`` and ``np.sort()``.

    Returns
    -------
    x : numpy.ndarray
        The measurements sorted in ascending order.
    y : numpy.ndarray
        ECDF value for each entry of ``x``: ``i / n`` for ``i = 1..n``, so
        the last value is 1.0.
    """
    # Number of data points: n
    n = len(data)
    # x-data for the ECDF: x
    x = np.sort(data)
    # y-data for the ECDF: y (float(n) forces true division on Python 2)
    y = np.arange(1, n + 1) / float(n)
    return x, y
# Note: `plt.plot?` is IPython/Jupyter-only syntax for showing a docstring and
# is a SyntaxError in plain Python; use help(plt.plot) interactively instead.
# Overlay the vote-share ECDF of each of the three selected states.
plt.figure(figsize=[16,5])
x1, y1 = ecdf(swing[swing['state']=='IL']['dem_share'])
x2, y2 = ecdf(swing[swing['state']=='PA']['dem_share'])
x3, y3 = ecdf(swing[swing['state']=='OH']['dem_share'])
# Points only, no connecting line; plotted in the order IL, PA, OH
_ = plt.plot(x1, y1, marker='.', linestyle='none')
_ = plt.plot(x2, y2, marker='.', linestyle='none')
_ = plt.plot(x3, y3, marker='.', linestyle='none')
_ = plt.xlabel('percent of vote for Obama')
_ = plt.ylabel('ECDF')
plt.margins(0.02) # Keeps data off plot edges
plt.show()