In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
%matplotlib inline
In [26]:
df = pd.read_csv('TrainingData.csv', index_col=0)
df.shape
Out[26]:
(400277, 25)
In [27]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 400277 entries, 134338 to 415831
Data columns (total 25 columns):
Function                  400277 non-null object
Use                       400277 non-null object
Sharing                   400277 non-null object
Reporting                 400277 non-null object
Student_Type              400277 non-null object
Position_Type             400277 non-null object
Object_Type               400277 non-null object
Pre_K                     400277 non-null object
Operating_Status          400277 non-null object
Object_Description        375493 non-null object
Text_2                    88217 non-null object
SubFund_Description       306855 non-null object
Job_Title_Description     292743 non-null object
Text_3                    179964 non-null object
Text_4                    53746 non-null object
Sub_Object_Description    91603 non-null object
Location_Description      162054 non-null object
FTE                       126071 non-null float64
Function_Description      342195 non-null object
Facility_or_Department    53886 non-null object
Position_Extra            264764 non-null object
Total                     395722 non-null float64
Program_Description       304660 non-null object
Fund_Description          202877 non-null object
Text_1                    292285 non-null object
dtypes: float64(2), object(23)
memory usage: 79.4+ MB
In [28]:
df.columns
Out[28]:
Index([u'Function', u'Use', u'Sharing', u'Reporting', u'Student_Type',
       u'Position_Type', u'Object_Type', u'Pre_K', u'Operating_Status',
       u'Object_Description', u'Text_2', u'SubFund_Description',
       u'Job_Title_Description', u'Text_3', u'Text_4',
       u'Sub_Object_Description', u'Location_Description', u'FTE',
       u'Function_Description', u'Facility_or_Department', u'Position_Extra',
       u'Total', u'Program_Description', u'Fund_Description', u'Text_1'],
      dtype='object')
In [31]:
df.head()
Out[31]:
Function Use Sharing Reporting Student_Type Position_Type Object_Type Pre_K Operating_Status Object_Description ... Sub_Object_Description Location_Description FTE Function_Description Facility_or_Department Position_Extra Total Program_Description Fund_Description Text_1
134338 Teacher Compensation Instruction School Reported School NO_LABEL Teacher NO_LABEL NO_LABEL PreK-12 Operating NaN ... NaN NaN 1.0 NaN NaN KINDERGARTEN 50471.810 KINDERGARTEN General Fund NaN
206341 NO_LABEL NO_LABEL NO_LABEL NO_LABEL NO_LABEL NO_LABEL NO_LABEL NO_LABEL Non-Operating CONTRACTOR SERVICES ... NaN NaN NaN RGN GOB NaN UNDESIGNATED 3477.860 BUILDING IMPROVEMENT SERVICES NaN BUILDING IMPROVEMENT SERVICES
326408 Teacher Compensation Instruction School Reported School Unspecified Teacher Base Salary/Compensation Non PreK PreK-12 Operating Personal Services - Teachers ... NaN NaN 1.0 NaN NaN TEACHER 62237.130 Instruction - Regular General Purpose School NaN
364634 Substitute Compensation Instruction School Reported School Unspecified Substitute Benefits NO_LABEL PreK-12 Operating EMPLOYEE BENEFITS ... NaN NaN NaN UNALLOC BUDGETS/SCHOOLS NaN PROFESSIONAL-INSTRUCTIONAL 22.300 GENERAL MIDDLE/JUNIOR HIGH SCH NaN REGULAR INSTRUCTION
47683 Substitute Compensation Instruction School Reported School Unspecified Teacher Substitute Compensation NO_LABEL PreK-12 Operating TEACHER COVERAGE FOR TEACHER ... NaN NaN NaN NON-PROJECT NaN PROFESSIONAL-INSTRUCTIONAL 54.166 GENERAL HIGH SCHOOL EDUCATION NaN REGULAR INSTRUCTION

5 rows × 25 columns

In [30]:
# df.columns = ['ID']+list(df.columns[1:])
In [36]:
df['Job_Title_Description'].value_counts().head(10)
Out[36]:
Teacher, Elementary               30939
Teacher, Short Term Sub           23450
(blank)                           15235
Teacher, Secondary (High)          8994
Teacher,Retrd Shrt Term Sub        8746
TEACHER, REGULAR                   8517
TEACHER SUBSTITUTE POOL            7115
SUB TEACHER ALL                    7018
Teacher Secondary (Middle)         6912
Teacher                            5033
Name: Job_Title_Description, dtype: int64
In [43]:
df['Object_Type'].value_counts().head(10)
Out[43]:
Base Salary/Compensation       97670
Benefits                       85467
NO_LABEL                       69644
Other Compensation/Stipend     61685
Supplies/Materials             31935
Substitute Compensation        27357
Contracted Services             7512
Other Non-Compensation          6297
Travel & Conferences            5030
Equipment & Equipment Lease     4460
Name: Object_Type, dtype: int64

columns

  • FTE: Stands for "full-time equivalent". If the budget item is associated to an employee, this number tells us the percentage of full-time that the employee works. A value of 1 means the associated employee works for the schooll full-time. A value close to 0 means the item is associated to a part-time or contracted employee.
  • Total: Stands for the total cost of the expenditure. This number tells us how much the budget item cost.
In [48]:
df['FTE'].value_counts().head()
Out[48]:
1.000000    35788
0.000000    31338
0.004310     8130
0.002155     2735
0.008621     2293
Name: FTE, dtype: int64
In [50]:
df['Total'].describe()
Out[50]:
count    3.957220e+05
mean     1.310586e+04
std      3.682254e+05
min     -8.746631e+07
25%               NaN
50%               NaN
75%               NaN
max      1.297000e+08
Name: Total, dtype: float64
In [58]:
plt.xlim?
In [67]:
# Create the histogram
plt.hist(df['FTE'].dropna(),bins=500,alpha=.5,color='r')
plt.xlim([-.2,1.2])

# Add title and labels
plt.title('Distribution of %full-time \n employee works')
plt.xlabel('% of full-time')
plt.ylabel('num employees')

# Display the histogram
plt.show()

datatypes

Objects instead of categories

Encode labels as categories

● ML algorithms work on numbers, not strings

● Need a numeric representation of these strings

● Strings can be slow compared to numbers

● In pandas, ‘category’ dtype encodes categorical data numerically

● Can speed up code

In [93]:
# sample_df.label = sample_df.label.astype('category')

Dummy variable encoding

In [71]:
df.head(1)
Out[71]:
Function Use Sharing Reporting Student_Type Position_Type Object_Type Pre_K Operating_Status Object_Description ... Sub_Object_Description Location_Description FTE Function_Description Facility_or_Department Position_Extra Total Program_Description Fund_Description Text_1
134338 Teacher Compensation Instruction School Reported School NO_LABEL Teacher NO_LABEL NO_LABEL PreK-12 Operating NaN ... NaN NaN 1.0 NaN NaN KINDERGARTEN 50471.81 KINDERGARTEN General Fund NaN

1 rows × 25 columns

In [70]:
df.dtypes
Out[70]:
Function                   object
Use                        object
Sharing                    object
Reporting                  object
Student_Type               object
Position_Type              object
Object_Type                object
Pre_K                      object
Operating_Status           object
Object_Description         object
Text_2                     object
SubFund_Description        object
Job_Title_Description      object
Text_3                     object
Text_4                     object
Sub_Object_Description     object
Location_Description       object
FTE                       float64
Function_Description       object
Facility_or_Department     object
Position_Extra             object
Total                     float64
Program_Description        object
Fund_Description           object
Text_1                     object
dtype: object
In [75]:
df['Use'].value_counts()
Out[75]:
Instruction                    203608
NO_LABEL                        78712
O&M                             45868
ISPD                            26118
Pupil Services & Enrichment     23779
Leadership                      15715
Business Services                6120
Untracked Budget Set-Aside        357
Name: Use, dtype: int64
In [79]:
type(df['Use']), type(df[['Use']])
Out[79]:
(pandas.core.series.Series, pandas.core.frame.DataFrame)
In [81]:
dummies = pd.get_dummies(df[['Use']], prefix_sep='_')
dummies.head()
Out[81]:
Use_Business Services Use_ISPD Use_Instruction Use_Leadership Use_NO_LABEL Use_O&M Use_Pupil Services & Enrichment Use_Untracked Budget Set-Aside
134338 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
206341 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
326408 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
364634 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
47683 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
In [82]:
dummies = pd.get_dummies(df['Use'], prefix_sep='_')
dummies.head()
Out[82]:
Business Services ISPD Instruction Leadership NO_LABEL O&M Pupil Services & Enrichment Untracked Budget Set-Aside
134338 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
206341 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
326408 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
364634 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
47683 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0

Lambda functions

In [84]:
categorize_label = lambda x: x.astype('category')
In [85]:
type(categorize_label)
Out[85]:
function
In [92]:
df_use = df[['Use']].apply(categorize_label, axis=0)
df_use.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 400277 entries, 134338 to 415831
Data columns (total 1 columns):
Use    400277 non-null category
dtypes: category(1)
memory usage: 3.4 MB
In [96]:
df.Function.astype('category').dtypes
Out[96]:
category
In [104]:
df.apply(categorize_label, axis=0).dtypes
Out[104]:
Function                  category
Use                       category
Sharing                   category
Reporting                 category
Student_Type              category
Position_Type             category
Object_Type               category
Pre_K                     category
Operating_Status          category
Object_Description        category
Text_2                    category
SubFund_Description       category
Job_Title_Description     category
Text_3                    category
Text_4                    category
Sub_Object_Description    category
Location_Description      category
FTE                       category
Function_Description      category
Facility_or_Department    category
Position_Extra            category
Total                     category
Program_Description       category
Fund_Description          category
Text_1                    category
dtype: object
In [103]:
df.dtypes.value_counts()
Out[103]:
object     23
float64     2
dtype: int64
In [102]:
df.apply(categorize_label).dtypes.value_counts()
Out[102]:
category    25
dtype: int64

Counting unique labels

In [109]:
# pd.Series.nunique?
# Return number of unique elements in the object.
In [115]:
df['Function'].nunique()
Out[115]:
37
In [122]:
for i in df: print i, '--- unique values: ', df[i].nunique()
Function --- unique values:  37
Use --- unique values:  8
Sharing --- unique values:  5
Reporting --- unique values:  3
Student_Type --- unique values:  9
Position_Type --- unique values:  25
Object_Type --- unique values:  11
Pre_K --- unique values:  3
Operating_Status --- unique values:  3
Object_Description --- unique values:  602
Text_2 --- unique values:  301
SubFund_Description --- unique values:  274
Job_Title_Description --- unique values:  3516
Text_3 --- unique values:  35
Text_4 --- unique values:  244
Sub_Object_Description --- unique values:  182
Location_Description --- unique values:  354
FTE --- unique values:  21003
Function_Description --- unique values:  687
Facility_or_Department --- unique values:  179
Position_Extra --- unique values:  580
Total --- unique values:  286494
Program_Description --- unique values:  421
Fund_Description --- unique values:  141
Text_1 --- unique values:  1423
In [123]:
# Calculate number of unique values for each label: num_unique_labels
num_unique_labels = df.apply(pd.Series.nunique)
num_unique_labels
Out[123]:
Function                      37
Use                            8
Sharing                        5
Reporting                      3
Student_Type                   9
Position_Type                 25
Object_Type                   11
Pre_K                          3
Operating_Status               3
Object_Description           602
Text_2                       301
SubFund_Description          274
Job_Title_Description       3516
Text_3                        35
Text_4                       244
Sub_Object_Description       182
Location_Description         354
FTE                        21003
Function_Description         687
Facility_or_Department       179
Position_Extra               580
Total                     286494
Program_Description          421
Fund_Description             141
Text_1                      1423
dtype: int64
In [131]:
# Plot number of unique values for each label
plt.figure(figsize=[10,5])
num_unique_labels.plot(kind='bar',alpha=.6)

plt.ylim([0,2000])
# Label the axes
plt.xlabel('Labels')
plt.ylabel('Number of unique values')

# Display the plot
plt.show()
In [145]:
# np.clip?

# np.clip(a, a_min, a_max, out=None)

# Docstring:
# Clip (limit) the values in an array.

# Given an interval, values outside the interval are clipped to
# the interval edges.  For example, if an interval of ``[0, 1]``
# is specified, values smaller than 0 become 0, and values larger
# than 1 become 1.
In [143]:
np.clip(range(1,10), 3, 7)
Out[143]:
array([3, 3, 3, 4, 5, 6, 7, 7, 7])

Log loss binary classification

● Log loss for binary classification

● Actual value: y = {1=yes, 0=no}

● Prediction (probability that the value is 1): p

logloss(N=1) = y log(p) + (1 − y) log(1 − p)

In [6]:
def compute_log_loss(predicted, actual, eps=1e-14):
    import numpy as np
    """ Computes the logarithmic loss between predicted and
    actual when these are 1D arrays.
    :param predicted: The predicted probabilities as floats between 0-1
    :param actual: The actual binary labels. Either 0 or 1.
    :param eps (optional): log(0) is inf, so we need to offset our
    predicted values slightly by eps from 0 or 1.
    """
    predicted = np.clip(predicted, eps, 1 - eps)
    
    loss = -1 * np.mean(actual * np.log(predicted) + (1 - actual)* np.log(1 - predicted))
    
    return loss
In [7]:
compute_log_loss(predicted=0.9, actual=0)
Out[7]:
2.3025850929940459
In [8]:
compute_log_loss(predicted=0.5, actual=1)
Out[8]:
0.69314718055994529
In [ ]: