In [58]:
# Suppress all warnings (e.g. pandas SettingWithCopyWarning, sklearn/seaborn
# deprecation notices) so they don't clutter the rendered output.
import warnings
warnings.filterwarnings('ignore')
In [59]:
# pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
In [60]:
# Load the Titanic train & test CSV files as DataFrames.
# NOTE: paths are relative to the notebook's working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Preview the first rows of the training set.
train.head()
Out[60]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

VARIABLE DESCRIPTIONS:

feature description
survival Survival (0 = No; 1 = Yes)
pclass Passenger Class(1 = 1st; 2 = 2nd; 3 = 3rd)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare
cabin Cabin
embarked Port of Embarkation(C = Cherbourg; Q = Queenstown; S = Southampton)
In [61]:
# Inspect dtypes and non-null counts for both frames (Age, Cabin, Embarked
# and test Fare contain missing values).
# DataFrame.info() writes to stdout itself and returns None, so the original
# Python-2-only `print train.info()` also emitted a spurious "None" line;
# calling info() directly fixes both issues.
train.info()
print("----------------------------")
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
----------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
In [62]:
# Drop identifier-like columns that carry no predictive signal.
# PassengerId is kept in `test` because the submission file requires it.
cols_unused = ['Name', 'Ticket']
train = train.drop(cols_unused + ['PassengerId'], axis=1)
test = test.drop(cols_unused, axis=1)

Plotting each feature

Embarked

In [63]:
# Embarked

# Only the training set has missing Embarked values (2 of them); fill them
# with the most frequent port, "S" (Southampton).
train["Embarked"] = train["Embarked"].fillna("S")

# Mean survival rate per embarkation port.
# NOTE(review): factorplot/size were renamed catplot/height in seaborn >= 0.9.
sns.factorplot('Embarked','Survived', data=train,size=4,aspect=3)
Out[63]:
<seaborn.axisgrid.FacetGrid at 0x1368f2b0>
In [64]:
# Three panels: passenger counts per port, survival counts split by port,
# and mean survival rate per port.
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))

sns.countplot(x='Embarked', data=train, ax=axis1, alpha=.8)
sns.countplot(x='Survived', hue="Embarked", data=train, order=[1,0], ax=axis2, alpha=.8)

# Group by Embarked and take the mean of Survived (= survival rate per port).
embark_perc = train[["Embarked", "Survived"]].groupby(['Embarked'],as_index=False).mean()

# print() call instead of the Python-2-only `print embark_perc` statement,
# consistent with the rest of the notebook.
print(embark_perc)
sns.barplot(x='Embarked', y='Survived', data=embark_perc,order=['S','C','Q'],ax=axis3, alpha=.8)
  Embarked  Survived
0        C  0.553571
1        Q  0.389610
2        S  0.339009
Out[64]:
<matplotlib.axes._subplots.AxesSubplot at 0x140c9f60>
In [65]:
# Keep Embarked in the model as dummy variables, dropping "S" as the
# baseline category (C & Q showed the better survival rates above).
# (Alternative considered: drop Embarked entirely as weakly predictive.)
dummies_embarked_train = pd.get_dummies(train['Embarked']).drop(['S'], axis=1)
dummies_embarked_test = pd.get_dummies(test['Embarked']).drop(['S'], axis=1)

# Attach the dummies and remove the now-redundant raw column.
train = train.join(dummies_embarked_train).drop(['Embarked'], axis=1)
test = test.join(dummies_embarked_test).drop(['Embarked'], axis=1)
In [66]:
# Preview the train frame after replacing Embarked with C/Q dummies.
train.head()
Out[66]:
Survived Pclass Sex Age SibSp Parch Fare Cabin C Q
0 0 3 male 22.0 1 0 7.2500 NaN 0.0 0.0
1 1 1 female 38.0 1 0 71.2833 C85 1.0 0.0
2 1 3 female 26.0 0 0 7.9250 NaN 0.0 0.0
3 1 1 female 35.0 1 0 53.1000 C123 0.0 0.0
4 0 3 male 35.0 0 0 8.0500 NaN 0.0 0.0

Fare

In [67]:
# Fare

# The test set has a single missing Fare; impute it with the median.
test["Fare"].fillna(test["Fare"].median(), inplace=True)

# Truncate fares from float to int.
train['Fare'] = train['Fare'].astype(int)
test['Fare'] = test['Fare'].astype(int)

# Split fares by survival outcome.
survived_mask = train["Survived"] == 1
fare_survived = train.loc[survived_mask, "Fare"]
fare_not_survived = train.loc[~survived_mask, "Fare"]

# Mean and std of fare per outcome (row 0 = not survived, row 1 = survived).
average_fare = pd.DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare = pd.DataFrame([fare_not_survived.std(), fare_survived.std()])

# Histogram of fares (zoomed to 0-50), then mean fare per outcome with
# std error bars.
train['Fare'].plot(kind='hist', figsize=(15,3), bins=100, xlim=(0,50), alpha=.8)

average_fare.index.names = std_fare.index.names = ["Survived"]
average_fare.plot(yerr=std_fare, kind='bar', legend=False, alpha=.7)
Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0x14ed1d68>

Age

In [68]:
# Age

# Seed the RNG so the randomly imputed ages are reproducible across runs
# (the original cell was unseeded, so every re-run produced different data).
np.random.seed(42)

# Mean, std, and NaN count of Age in the training set.
average_age_titanic   = train["Age"].mean()
std_age_titanic       = train["Age"].std()
count_nan_age_titanic = train["Age"].isnull().sum()

# Same statistics for the test set.
average_age_test   = test["Age"].mean()
std_age_test       = test["Age"].std()
count_nan_age_test = test["Age"].isnull().sum()

# Draw one random integer age in [mean - std, mean + std) for each NaN,
# to be used as imputed values in the next cell.
rand_1 = np.random.randint(average_age_titanic - std_age_titanic, average_age_titanic + std_age_titanic, size = count_nan_age_titanic)
rand_2 = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, size = count_nan_age_test)
In [69]:
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
axis1.set_title('Original Age values - Titanic')
axis2.set_title('New Age values - Titanic')

# Plot the original Age distribution (drop NaNs, cast to int for clean bins).
train['Age'].dropna().astype(int).hist(bins=70, ax=axis1, alpha=.8)

# Fill NaN ages with the random values generated in the previous cell.
# Use df.loc[mask, col] instead of the original chained indexing
# (train["Age"][np.isnan(...)] = ...), which raises SettingWithCopyWarning
# and may silently fail to write to the frame.
train.loc[train["Age"].isnull(), "Age"] = rand_1
test.loc[test["Age"].isnull(), "Age"] = rand_2

# Convert from float to int now that no NaNs remain.
train['Age'] = train['Age'].astype(int)
test['Age']  = test['Age'].astype(int)

# Plot the imputed Age distribution for comparison.
train['Age'].hist(bins=70, ax=axis2, alpha=.8)
Out[69]:
<matplotlib.axes._subplots.AxesSubplot at 0x1361bba8>
In [70]:
# .... continue with plotting the Age column

# KDE of Age, one curve per Survived value, to compare the age
# distributions of survivors vs. non-survivors.
facet = sns.FacetGrid(train, hue="Survived",aspect=4)
facet.map(sns.kdeplot,'Age',shade= True)
facet.set(xlim=(0, train['Age'].max()))
facet.add_legend()
Out[70]:
<seaborn.axisgrid.FacetGrid at 0x13ead320>
In [71]:
# Mean survival rate for each (integer) age value.
fig, axis1 = plt.subplots(1,1,figsize=(18,4))
average_age = train[["Age", "Survived"]].groupby(['Age'],as_index=False).mean()
sns.barplot(x='Age', y='Survived', data=average_age)
Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x1540e240>

Cabin

In [72]:
# Cabin

# Show the NaN count vs. total rows: Cabin is mostly missing in both sets.
# format-based print() replaces the Python-2-only `print a, b` statement
# (naively adding parentheses would have printed a tuple under Python 2).
print("{} {}".format(train['Cabin'].isnull().sum(), train['Cabin'].shape[0]))
print("{} {}".format(test['Cabin'].isnull().sum(), test['Cabin'].shape[0]))

# Too sparse to contribute to prediction; drop the column.
train.drop("Cabin",axis=1,inplace=True)
test.drop("Cabin",axis=1,inplace=True)
687 891
327 418

Family

In [73]:
# Preview one row after dropping Cabin to confirm the remaining columns.
train.head(1)
Out[73]:
Survived Pclass Sex Age SibSp Parch Fare C Q
0 0 3 male 22 1 0 7 0.0 0.0
In [74]:
# Family

# Collapse Parch & SibSp into one binary feature: 1 if the passenger had
# any family member aboard (parent, sibling, spouse, child), else 0.
train['Family'] = train["Parch"] + train["SibSp"]
# df.loc[mask, col] instead of the original chained
# train['Family'].loc[...] = 1 assignment, which raises
# SettingWithCopyWarning and is not guaranteed to write to the frame.
train.loc[train['Family'] > 0, 'Family'] = 1

test['Family'] = test["Parch"] + test["SibSp"]
test.loc[test['Family'] > 0, 'Family'] = 1

# Parch & SibSp are now redundant.
train = train.drop(['SibSp','Parch'], axis=1)
test = test.drop(['SibSp','Parch'], axis=1)

subplot

sharex

In [75]:
# plot
fig, (axis1,axis2) = plt.subplots(1,2,sharex=True,figsize=(9,4))

# sns.factorplot('Family',data=titanic_df,kind='count',ax=axis1)
sns.countplot(x='Family', data=train, order=[1,0], ax=axis1, alpha=.7)

# average of survived for those who had/didn't have any family member
family_perc = train[["Family", "Survived"]].groupby(['Family'],as_index=False).mean()
sns.barplot(x='Family', y='Survived', data=family_perc, order=[1,0], ax=axis2, alpha=.7)

axis1.set_xticklabels(["With Family","Alone"], rotation=0)
Out[75]:
[<matplotlib.text.Text at 0x13c81860>, <matplotlib.text.Text at 0x15881400>]

Sex

In [76]:
# Sex

# As we see, children (age < ~16) aboard seem to have a high chance of survival.
# So, we can classify passengers as males, females, and child
def get_person(passenger):
    """Classify a passenger as 'child' (age < 16) or by their sex.

    passenger is an (age, sex) pair, e.g. a row of df[['Age', 'Sex']].
    """
    age, sex = passenger
    if age < 16:
        return 'child'
    return sex
# Derive the Person category from Age & Sex for both frames.
train['Person'] = train[['Age','Sex']].apply(get_person, axis=1)
test['Person'] = test[['Age','Sex']].apply(get_person, axis=1)

# Person supersedes Sex, so Sex can go.
train = train.drop(['Sex'], axis=1)
test = test.drop(['Sex'], axis=1)
In [77]:
# Verify the three Person categories, then preview the frame.
# print() call instead of the Python-2-only `print expr` statement.
print(train['Person'].unique())
train.head(1)
['male' 'female' 'child']
Out[77]:
Survived Pclass Age Fare C Q Family Person
0 0 3 22 7 0.0 0.0 1 male
In [78]:
# One-hot encode Person, dropping 'Male' as the baseline category
# (it has the lowest average survival).
def _person_dummies(frame):
    # get_dummies yields columns in category order: child, female, male.
    dummies = pd.get_dummies(frame['Person'])
    dummies.columns = ['Child', 'Female', 'Male']
    return dummies.drop('Male', axis=1)

train = train.join(_person_dummies(train))
test = test.join(_person_dummies(test))
In [79]:
# Left: counts per Person category; right: mean survival rate per category.
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(10,4))

sns.countplot(x='Person', data=train, ax=axis1, alpha=.8)

# Mean of Survived for each Person value (male, female, or child).
person_perc = train[["Person", "Survived"]].groupby(['Person'],as_index=False).mean()
sns.barplot(x='Person', y='Survived', data=person_perc, ax=axis2, order=['male','female','child'], alpha=.8)

# Person has served its purpose (dummies were created above); drop it.
train.drop(['Person'],axis=1,inplace=True)
test.drop(['Person'],axis=1,inplace=True)

Pclass

In [80]:
# Pclass

# Mean survival rate per passenger class.
# NOTE(review): factorplot/size were renamed catplot/height in seaborn >= 0.9.
sns.factorplot('Pclass','Survived',order=[1,2,3], data=train,size=3, aspect=2)
Out[80]:
<seaborn.axisgrid.FacetGrid at 0x15598630>
In [81]:
# One-hot encode Pclass, dropping Class_3 as the baseline category
# (it has the lowest average survival); the raw Pclass column is then
# replaced by the dummies.
def _pclass_dummies(frame):
    dummies = pd.get_dummies(frame['Pclass'])
    dummies.columns = ['Class_1', 'Class_2', 'Class_3']
    return dummies.drop('Class_3', axis=1)

# The dummies are computed from the original frames before Pclass is dropped
# (function arguments are evaluated before the rebinding takes effect).
train = train.drop(['Pclass'], axis=1).join(_pclass_dummies(train))
test = test.drop(['Pclass'], axis=1).join(_pclass_dummies(test))

Create Models

In [86]:
# Final engineered training features (Survived is the label column).
train.head(1)
Out[86]:
Survived Age Fare C Q Family Child Female Class_1 Class_2
0 0 22 7 0.0 0.0 1 0.0 0.0 0.0 0.0
In [89]:
# NOTE(review): this import ideally belongs in the imports cell at the top.
from sklearn.model_selection import train_test_split

# Hold out 25% of the training rows as a validation split; column 0 of
# `train` is the Survived label, the rest are features.
# random_state pins the split so the model scores below are reproducible
# (the original call was unseeded).
X_train, X_test, Y_train, Y_test = train_test_split(train.iloc[:,1:], train['Survived'], test_size=.25, random_state=42)
In [90]:
# Sanity-check the split sizes (75% / 25% of 891 rows).
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape
Out[90]:
((668, 9), (223, 9), (668L,), (223L,))
In [91]:
# Logistic Regression
# (the fitted `logreg` is reused later to inspect feature coefficients)

logreg = LogisticRegression()

logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

# NOTE(review): this is accuracy on the TRAINING split; score(X_test, Y_test)
# would give an unbiased estimate.
logreg.score(X_train, Y_train)
Out[91]:
0.80988023952095811
In [92]:
# Support Vector Machines

# Support Vector Machines

svc = SVC()

svc.fit(X_train, Y_train)

Y_pred = svc.predict(X_test)

# NOTE(review): training-split accuracy, not a held-out estimate.
svc.score(X_train, Y_train)
Out[92]:
0.8652694610778443
In [93]:
# Random Forests

# Random Forests

random_forest = RandomForestClassifier(n_estimators=100)

random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

# NOTE(review): training-split accuracy — random forests fit the training
# data closely, so this figure is optimistic.
random_forest.score(X_train, Y_train)
Out[93]:
0.9640718562874252
In [94]:
# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors = 3)

knn.fit(X_train, Y_train)

Y_pred = knn.predict(X_test)

# NOTE(review): training-split accuracy, not a held-out estimate.
knn.score(X_train, Y_train)
Out[94]:
0.82485029940119758
In [96]:
# Gaussian Naive Bayes

# Gaussian Naive Bayes

gaussian = GaussianNB()

gaussian.fit(X_train, Y_train)

Y_pred = gaussian.predict(X_test)

# NOTE(review): training-split accuracy, not a held-out estimate.
gaussian.score(X_train, Y_train)
Out[96]:
0.7859281437125748
In [101]:
# Feature names: every column except Survived (position 0).
train.columns.delete(0)
Out[101]:
Index([u'Age', u'Fare', u'C', u'Q', u'Family', u'Child', u'Female', u'Class_1',
       u'Class_2'],
      dtype='object')

get Correlation Coefficient for each feature using Logistic Regression

In [105]:
# Pair each feature name with its fitted logistic-regression coefficient.
# NOTE(review): these are raw (unstandardized) coefficients, so magnitudes
# are only comparable between features with similar scales.
coeff_df = pd.DataFrame(train.columns.delete(0))
coeff_df.columns = ['Features']
coeff_df["Coefficient Estimate"] = pd.Series(logreg.coef_[0])

# preview
coeff_df
Out[105]:
Features Coefficient Estimate
0 Age -0.033095
1 Fare 0.001018
2 C 0.646655
3 Q 0.329587
4 Family -0.221548
5 Child 1.189572
6 Female 2.579748
7 Class_1 2.015902
8 Class_2 1.095272

Chosen model: Random Forests (highest training-set score above)

In [111]:
# Final train frame: the Survived label plus the engineered features.
train.head(1)
Out[111]:
Survived Age Fare C Q Family Child Female Class_1 Class_2
0 0 22 7 0.0 0.0 1 0.0 0.0 0.0 0.0
In [112]:
# Final test frame: PassengerId (kept for the submission) plus features.
test.head(1)
Out[112]:
PassengerId Age Fare C Q Family Child Female Class_1 Class_2
0 892 34 7 0.0 1.0 0 0.0 0.0 0.0 0.0
In [110]:
# Random Forests

# Retrain on the FULL training set (not the 75% split) for the submission.
# random_state makes the fitted forest — and hence the submission file —
# reproducible; the original cell was unseeded.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

random_forest.fit(train.iloc[:,1:], train['Survived'])

# Predict on the test features; PassengerId is an identifier, not a feature.
Y_pred = random_forest.predict(test.drop('PassengerId', axis=1))

Y_pred.shape
Out[110]:
(418L,)
In [115]:
# Spot-check the first ten predictions.
Y_pred[:10]
Out[115]:
array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0], dtype=int64)
In [116]:
# Build the Kaggle submission file: PassengerId plus predicted Survived.
submission = pd.DataFrame()
submission["PassengerId"] = test["PassengerId"]
submission["Survived"] = Y_pred
submission.to_csv('titanic_submit.csv', index=False)
In [ ]: