In [2]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier


# Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.model_selection import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
%matplotlib inline
# Start from matplotlib's 'ggplot' style sheet ...
mpl.style.use( 'ggplot' )
# ... then apply seaborn's 'white' axes style on top of it
sns.set_style( 'white' )
# Default figure size for figures that do not pass their own figsize
pylab.rcParams[ 'figure.figsize' ] = 8 , 6
In [97]:
def plot_histograms(df, variables, n_rows, n_cols):
    """Draw one 10-bin histogram per column on an n_rows x n_cols grid.

    Each subplot is titled with the column name and its skew (rounded to
    two decimals) and its y axis is labelled 'counts'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    variables : iterable of column labels
        Columns of ``df`` to plot, one subplot each, filled row by row.
    n_rows, n_cols : int
        Shape of the subplot grid.

    Raises
    ------
    ValueError
        If the grid has fewer cells than there are variables.
    """
    variables = list(variables)
    # Fail before drawing anything instead of part-way through add_subplot.
    if len(variables) > n_rows * n_cols:
        raise ValueError(
            'grid of %d x %d cannot hold %d histograms'
            % (n_rows, n_cols, len(variables))
        )

    fig = plt.figure(figsize=(12, 8))

    for position, var_name in enumerate(variables, start=1):
        ax = fig.add_subplot(n_rows, n_cols, position)
        df[var_name].hist(bins=10, ax=ax, alpha=.7)
        skew = round(float(df[var_name].skew()), 2)
        ax.set_title(var_name + ' Skew: ' + str(skew))
        # Label the axis; the previous set_yticklabels(['counts']) call
        # replaced the numeric tick labels with a single string.
        ax.set_ylabel('counts')

    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution(df, var, target, **kwargs):
    """Plot shaded KDE curves of `var`, one hue per value of `target`.

    The optional keyword arguments `row` and `col` are forwarded to
    seaborn's FacetGrid (both default to None); any other keyword is
    ignored. The x axis is limited to the range 0 .. max of `df[var]`.
    """
    grid = sns.FacetGrid(
        df,
        hue=target,
        aspect=4,
        row=kwargs.get('row'),
        col=kwargs.get('col'),
    )
    grid.map(sns.kdeplot, var, shade=True)
    upper_bound = df[var].max()
    grid.set(xlim=(0, upper_bound))
    grid.add_legend()

def plot_categories(df, cat, target, **kwargs):
    """Bar-plot `target` against the categories of `cat` on a FacetGrid.

    The optional keyword arguments `row` and `col` pick the faceting
    columns (both default to None); any other keyword is ignored.
    """
    facet_row = kwargs.get('row')
    facet_col = kwargs.get('col')
    grid = sns.FacetGrid(df, row=facet_row, col=facet_col)
    grid.map(sns.barplot, cat, target)
    grid.add_legend()

def plot_correlation_map(df):
    """Draw an annotated heatmap of pairwise correlations in `df`.

    Only numeric and boolean columns take part. Selecting them explicitly
    keeps the function working on frames that also hold text columns:
    recent pandas raises from ``DataFrame.corr`` on such columns rather
    than skipping them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are correlated.
    """
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr()

    _, ax = plt.subplots(figsize=(12, 10))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(
        corr,
        cmap=cmap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        annot_kws={'fontsize': 12},
    )

def describe_more(df):
    """Summarise each column of `df` by distinct-value count and dtype.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        One row per column of `df` with the fields 'Variable' (column
        label), 'Levels' (number of distinct non-null values) and
        'Datatype' (column dtype), sorted by 'Levels' ascending. The index
        holds each column's original position in `df`.
    """
    columns = list(df.columns)
    levels = pd.DataFrame({
        'Variable': columns,
        # nunique() skips NaN, just like the length of value_counts() did.
        'Levels': [df[name].nunique() for name in columns],
        'Datatype': [df[name].dtype for name in columns],
    })
    return levels.sort_values(by='Levels')

def plot_variable_importance(X, y):
    """Fit a decision tree (random_state=99) on X, y and plot its importances.

    Plotting and the printed score are delegated to plot_model_var_imp,
    which receives the same X and y the tree was fitted on.
    """
    fitted_tree = DecisionTreeClassifier(random_state=99).fit(X, y)
    plot_model_var_imp(fitted_tree, X, y)
    
def plot_model_var_imp(model, X, y, top_n=10):
    """Plot the `top_n` largest feature importances of a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_`` and ``score(X, y)``.
    X : pandas.DataFrame
        Feature frame; its column labels name the importances.
    y : array-like
        Target passed to ``model.score``.
    top_n : int, default 10
        Number of most important features to draw.

    Side effects
    ------------
    Draws a horizontal bar chart and prints ``model.score(X, y)``.
    """
    imp = pd.DataFrame(
        model.feature_importances_,
        columns=['Importance'],
        index=X.columns,
    )
    # Sort ascending and keep the tail: that is the top_n largest values.
    # The previous head slice kept the *least* important features whenever
    # X had more than ten columns.
    top = imp.sort_values(['Importance'], ascending=True).tail(top_n)
    top.plot(kind='barh')
    print(model.score(X, y))
In [60]:
# Load the mpg data set; show its (rows, columns) shape and the first two rows.
df = pd.read_csv('mpg.csv')
# Function-call form prints the same text on Python 2 and also parses on Python 3.
print(df.shape)
df.head(2)
(398, 9)
Out[60]:
mpg cylinders displacement horsepower weight acceleration model_year origin name
0 18.0 8 307.0 130 3504 12.0 70 1 chevrolet chevelle malibu
1 15.0 8 350.0 165 3693 11.5 70 1 buick skylark 320
In [61]:
# Histograms (skew shown in each title) of mpg and model_year, stacked in a 2 x 1 grid
plot_histograms(df, ['mpg','model_year'], 2, 1)
In [73]:
# Column dtypes and non-null counts; note that horsepower is read as object, not a numeric dtype
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null float64
horsepower      398 non-null object
weight          398 non-null int64
acceleration    398 non-null float64
model_year      398 non-null int64
origin          398 non-null int64
name            398 non-null object
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB
In [68]:
# Number of rows for each distinct cylinders value
df['cylinders'].value_counts()
Out[68]:
4    204
8    103
6     84
3      4
5      3
Name: cylinders, dtype: int64
In [62]:
# Distinct-value count and dtype for every column, fewest levels first
describe_more(df)
Out[62]:
Datatype Levels Variable
7 int64 3 origin
1 int64 5 cylinders
6 int64 13 model_year
2 float64 82 displacement
3 object 94 horsepower
5 float64 95 acceleration
0 float64 129 mpg
8 object 305 name
4 int64 351 weight
In [84]:
# Load train.csv into `titanic`
titanic = pd.read_csv('train.csv')
# Plot distributions of Age of passengers who survived or did not survive,
# with one facet row per value of Sex
plot_distribution( titanic , var = 'Age' , target = 'Survived' , row = 'Sex' )
In [83]:
# KDE of weight, one curve per cylinders value
plot_distribution(df,'weight', 'cylinders')
In [85]:
# Annotated correlation heatmap for the mpg frame
plot_correlation_map(df)
In [98]:
# Bar plot of Survived for each Embarked category
plot_categories(titanic, cat = 'Embarked' , target = 'Survived' )
In [99]:
# Bar plot of cylinders for each model_year
plot_categories(df, cat = 'model_year' , target = 'cylinders' )
In [107]:
# Features: every column from position 2 onward. Take an explicit copy so that
# filling / encoding X later never writes through to (or warns about) `titanic`.
X = titanic.iloc[:, 2:].copy()
# Target: the Survived column.
Y = titanic['Survived']
In [123]:
# Make every feature numeric:
#   * numeric columns  -> missing values replaced by the column median
#   * all other columns -> integer codes from pd.factorize (missing values become -1)
# The dtype is checked first so that median() is never called on a text column,
# which raises TypeError on current pandas.
for col in X.columns:
    if pd.api.types.is_numeric_dtype(X[col]):
        X[col] = X[col].fillna(X[col].median())
    else:
        X[col] = pd.factorize(X[col])[0]

X.head(1)
Out[123]:
Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 3 0 0 22.0 1 0 0 7.25 -1 0
In [124]:
# Check that every feature column is now numeric and has no missing values
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Pclass      891 non-null int64
Name        891 non-null int32
Sex         891 non-null int32
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null int32
Fare        891 non-null float64
Cabin       891 non-null int32
Embarked    891 non-null int32
dtypes: float64(2), int32(5), int64(3)
memory usage: 52.3 KB
In [125]:
# Fit a decision tree on (X, Y) and plot its feature importances.
# The printed score is computed on the same rows the tree was fitted on,
# so it is training accuracy, not an estimate of held-out performance.
plot_variable_importance(X, Y)
1.0
In [ ]: