The pipeline workflow

● Repeatable way to go from raw data to trained model

● Pipeline object takes sequential list of steps

● Output of one step is input to next step

● Each step is a tuple with two elements

● Name: a string identifying the step

● Transform: an object implementing .fit() and .transform()

● Flexible: a step can itself be another pipeline! (see the sketch below)
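A minimal sketch of the pattern (the step names, StandardScaler, and the toy structure are illustrative, not part of this notebook):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Each step is a (name, object) tuple; every step but the last must
# implement .fit()/.transform(), and the last may be an estimator.
scaling = Pipeline([('scale', StandardScaler())])
pl = Pipeline([
    ('prep', scaling),               # a step can itself be a Pipeline
    ('clf', LogisticRegression())
])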

Dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
df = pd.read_csv('TrainingData.csv', index_col=0)
df.shape
Out[2]:
(400277, 25)
In [3]:
df.head(1)
Out[3]:
Function Use Sharing Reporting Student_Type Position_Type Object_Type Pre_K Operating_Status Object_Description ... Sub_Object_Description Location_Description FTE Function_Description Facility_or_Department Position_Extra Total Program_Description Fund_Description Text_1
134338 Teacher Compensation Instruction School Reported School NO_LABEL Teacher NO_LABEL NO_LABEL PreK-12 Operating NaN ... NaN NaN 1.0 NaN NaN KINDERGARTEN 50471.81 KINDERGARTEN General Fund NaN

1 rows × 25 columns

In [4]:
df.Use.value_counts()
Out[4]:
Instruction                    203608
NO_LABEL                        78712
O&M                             45868
ISPD                            26118
Pupil Services & Enrichment     23779
Leadership                      15715
Business Services                6120
Untracked Budget Set-Aside        357
Name: Use, dtype: int64
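These eight categories are exactly what pd.get_dummies turns into the eight one-hot target columns used in the split below:

pd.get_dummies(df['Use']).shape   # -> (400277, 8), one column per category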
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 400277 entries, 134338 to 415831
Data columns (total 25 columns):
Function                  400277 non-null object
Use                       400277 non-null object
Sharing                   400277 non-null object
Reporting                 400277 non-null object
Student_Type              400277 non-null object
Position_Type             400277 non-null object
Object_Type               400277 non-null object
Pre_K                     400277 non-null object
Operating_Status          400277 non-null object
Object_Description        375493 non-null object
Text_2                    88217 non-null object
SubFund_Description       306855 non-null object
Job_Title_Description     292743 non-null object
Text_3                    179964 non-null object
Text_4                    53746 non-null object
Sub_Object_Description    91603 non-null object
Location_Description      162054 non-null object
FTE                       126071 non-null float64
Function_Description      342195 non-null object
Facility_or_Department    53886 non-null object
Position_Extra            264764 non-null object
Total                     395722 non-null float64
Program_Description       304660 non-null object
Fund_Description          202877 non-null object
Text_1                    292285 non-null object
dtypes: float64(2), object(23)
memory usage: 79.4+ MB
In [6]:
df_numeric = df[['Total','FTE']]

Train/test split

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_numeric, pd.get_dummies(df['Use']), random_state=777)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[7]:
((300207, 2), (100070, 2), (300207, 8), (100070, 8))
In [8]:
y_train.head(3)
Out[8]:
Business Services ISPD Instruction Leadership NO_LABEL O&M Pupil Services & Enrichment Untracked Budget Set-Aside
170488 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
67535 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
16714 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
In [9]:
X_train.head(3)
Out[9]:
Total FTE
170488 109.11872 NaN
67535 -9.63000 NaN
16714 -470.04000 NaN

Build pipeline

Impute null values

In [10]:
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
In [11]:
from sklearn.preprocessing import Imputer
# Imputer default is "mean"
from time import time
In [12]:
pl = Pipeline([
    ('imp',Imputer()),
    ('clf',OneVsRestClassifier(LogisticRegression()))
])
In [13]:
start = time()
pl.fit(X_train, y_train)
print 'used: {:.2f}s'.format(time()-start)
used: 10.04s
In [14]:
accuracy = pl.score(X_test, y_test)
print 'accuracy: ', accuracy
accuracy:  0.0331268112321
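A quick look at what was just fit (named_steps is standard Pipeline API): OneVsRestClassifier trains one binary LogisticRegression per target column, and .score() on a multilabel y is subset accuracy, so all eight indicator columns must match for a row to count as correct, which is part of why the number above is so low.

clf = pl.named_steps['clf']
len(clf.estimators_)   # -> 8, one binary model per 'Use' category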

Text features and feature unions

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
In [16]:
text = df['Text_1']
text.fillna('Non Type', inplace=True)
In [17]:
X_train, X_test, y_train, y_test = train_test_split(text, pd.get_dummies(df['Use']), random_state = 777)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[17]:
((300207L,), (100070L,), (300207, 8), (100070, 8))
In [18]:
pl = Pipeline([
    ('vec',CountVectorizer()),
    ('clf',OneVsRestClassifier(LogisticRegression()))
])
In [19]:
start = time()
pl.fit(X_train, y_train)
print 'used: {:.2f}s'.format(time()-start)
used: 10.93s
In [20]:
accuracy = pl.score(X_test, y_test)
print 'accuracy: ', accuracy
accuracy:  0.681822724093

Exploring CountVectorizer

In [21]:
co = CountVectorizer()
In [22]:
text.shape
Out[22]:
(400277L,)
In [23]:
vec = co.fit_transform(text)
vec
Out[23]:
<400277x1572 sparse matrix of type '<type 'numpy.int64'>'
	with 874211 stored elements in Compressed Sparse Row format>
In [24]:
co.vocabulary_.items()[:30]
Out[24]:
[(u'all', 58),
 (u'dist', 412),
 (u'sch', 1260),
 (u'keybank', 829),
 (u'consolidated', 315),
 (u'hats', 637),
 (u'extracurricular', 521),
 (u'foun', 565),
 (u'yellow', 1561),
 (u'facilities', 523),
 (u'disc', 409),
 (u'negl', 974),
 (u'alt', 66),
 (u'hate', 636),
 (u'children', 245),
 (u'chllnge', 246),
 (u'feeding', 539),
 (u'const', 317),
 (u'xtr', 1559),
 (u'aug', 115),
 (u'units', 1491),
 (u'tr', 1453),
 (u'to', 1446),
 (u'votec', 1521),
 (u'program', 1129),
 (u'teaching', 1422),
 (u'te', 1418),
 (u'spec', 1331),
 (u'activities', 25),
 (u'officemax', 1008)]
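To see which tokens a particular document contains, inverse_transform maps a row of the sparse matrix back to its vocabulary terms (output depends on the fitted vocabulary):

co.inverse_transform(vec[0])   # terms with nonzero counts in the first Text_1 entry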

Preprocessing multiple dtypes

● Want to use all available features in one pipeline

● Problem

● Pipeline steps for numeric and text preprocessing can’t follow each other

● e.g., output of CountVectorizer can’t be input to Imputer

● Solution

● FunctionTransformer() & FeatureUnion()


FunctionTransformer

● Turns a Python function into an object that a scikit-learn pipeline can understand

● Need to write two functions for pipeline preprocessing

● Take entire DataFrame, return numeric columns

● Take entire DataFrame, return text columns

● Can then preprocess numeric and text data in separate pipelines

In [25]:
df = pd.read_csv('TrainingData.csv', index_col=0)
df.shape
Out[25]:
(400277, 25)
In [26]:
from sklearn.preprocessing import FunctionTransformer
In [27]:
get_text = FunctionTransformer(lambda x: x['Text_1'], validate=False)
get_numeric = FunctionTransformer(lambda x: x[['Total','FTE']], validate=False)

text_fillna = FunctionTransformer(lambda x: x.fillna('No class type'), validate=False)

FeatureUnion: text and numeric features

In [28]:
from sklearn.pipeline import FeatureUnion
In [29]:
numeric_pipeline = Pipeline([
    ('selector',get_numeric),
    ('imputer',Imputer())
])

text_pipeline = Pipeline([
    ('selector',get_text),
    ('fillna',text_fillna),
    ('vectorizer',CountVectorizer())
])
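Each branch can be sanity-checked on its own before they are combined; a sketch using the two pipelines just defined (expected shapes inferred from the earlier cells):

numeric_pipeline.fit_transform(df).shape   # -> (400277, 2), NaNs imputed
text_pipeline.fit_transform(df).shape      # -> (400277, vocabulary size), sparse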
In [30]:
pl = Pipeline([
    ('Union',FeatureUnion([
        ('numeric',numeric_pipeline),
        ('text',text_pipeline)
    ])),
    ('clf',OneVsRestClassifier(LogisticRegression()))
])

pl
Out[30]:
Pipeline(steps=[('Union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric', Pipeline(steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x0000000010BED4A8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y=False, validate=False)), ('...=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))])
In [31]:
X_train, X_test, y_train, y_test = train_test_split(df, pd.get_dummies(df['Use']), random_state = 777)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[31]:
((300207, 25), (100070, 25), (300207, 8), (100070, 8))
In [32]:
start = time()
pl.fit(X_train, y_train)
print 'used: {:.2f}s'.format(time()-start)
used: 18.10s
In [33]:
accuracy = pl.score(X_test, y_test)
print 'accuracy: ', accuracy
accuracy:  0.398491056261

Choosing a classification model

In [34]:
LABELS = ['Function', 'Use', 'Sharing', 'Reporting', 'Student_Type','Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status']
In [35]:
NUMERIC_COLUMNS = ['Total','FTE']
In [36]:
NON_LABELS = [c for c in df.columns if c not in LABELS]
NON_LABELS
Out[36]:
['Object_Description',
 'Text_2',
 'SubFund_Description',
 'Job_Title_Description',
 'Text_3',
 'Text_4',
 'Sub_Object_Description',
 'Location_Description',
 'FTE',
 'Function_Description',
 'Facility_or_Department',
 'Position_Extra',
 'Total',
 'Program_Description',
 'Fund_Description',
 'Text_1']
In [37]:
len(NON_LABELS) - len(NUMERIC_COLUMNS)
Out[37]:
14

Using the pipeline with the main dataset

In [38]:
import numpy as np
dummy_labels = pd.get_dummies(df[LABELS])
dummy_labels.shape
Out[38]:
(400277, 104)
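The 104 columns are the categories of all nine label columns stacked side by side; a quick check (df.info() above showed the label columns have no NaNs, so get_dummies drops nothing):

sum(df[c].nunique() for c in LABELS)   # -> 104, matching dummy_labels.shape[1]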
In [42]:
from multi_split import multilabel_train_test_split
In [43]:
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS], dummy_labels, 0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[43]:
((320222, 16), (80055, 16), (320222, 104), (80055, 104))

Flexibility of model step

● Is current model the best?

● Can quickly try different models with pipelines

● Pipeline preprocessing steps unchanged

● Edit the model step in your pipeline

● Random Forest, Naïve Bayes, k-NN (see the sketch below)
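A sketch of the swap: Pipeline.set_params can replace a step by name while the preprocessing union stays untouched (RandomForestClassifier is just one of the candidates named above):

from sklearn.ensemble import RandomForestClassifier

# 'clf' is the step name used in the pipelines throughout this notebook
pl.set_params(clf=OneVsRestClassifier(RandomForestClassifier()))
pl.fit(X_train, y_train)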

In [61]:
def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS + LABELS):
    """ Takes the dataset as read in, drops the non-feature, non-text columns and
        then combines all of the text columns into a single vector that has all of
        the text for a row.
        
        :param data_frame: The data as read in with read_csv (no preprocessing necessary)
        :param to_drop (optional): Removes the numeric and label columns by default.
    """
    # drop non-text columns that are in the df
    to_drop = set(to_drop) & set(data_frame.columns.tolist())
    text_data = data_frame.drop(to_drop, axis=1)
    
    # replace nans with blanks
    text_data.fillna("", inplace=True)
    
    # joins all of the text items in a row (axis=1)
    # with a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)
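A quick look at what the function produces (illustrative):

combine_text_columns(df).head(2)   # one space-joined string per row, built from the 14 text columns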
In [62]:
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# Get the dummy encoding of the labels
dummy_labels = pd.get_dummies(df[LABELS])

# Get the columns that are features in the original df
NON_LABELS = [c for c in df.columns if c not in LABELS]

# Split into training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS],
                                                               dummy_labels,
                                                               0.2, 
                                                               seed=123)

# Preprocess the text data: get_text_data
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

# Preprocess the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)
In [63]:
text_fillna = FunctionTransformer(lambda x: x.fillna('No class type'), validate=False)
In [75]:
# Complete the pipeline: pl
from sklearn.linear_model import SGDClassifier


pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('fillna',text_fillna),
                    ('vectorizer', CountVectorizer())
                ]))
             ]
        )),
        ('clf', OneVsRestClassifier(SGDClassifier()))
    ])
pl
Out[75]:
Pipeline(steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric_features', Pipeline(steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x0000000013EC7898>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y=False, validate=Fa...r_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
          n_jobs=1))])
In [76]:
start = time()
# Fit to the training data
pl.fit(X_train, y_train)
print 'used: {:.2f}s'.format(time()-start)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)
used: 48.59s

Accuracy on budget dataset:  2.4982824308288053e-05
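Subset accuracy over 104 indicator columns is an extremely strict yardstick, which is why the score above is close to zero; a per-label view (a sketch using sklearn.metrics) is far gentler:

from sklearn.metrics import hamming_loss

y_pred = pl.predict(X_test)
1 - hamming_loss(y_test, y_pred)   # fraction of individual labels predicted correctly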