Turning a Jupyter notebook into a blog post:
download the Jupyter notebook as HTML,
add that HTML source code to an XYZ HTML block,
and embed it in the post like this:
In [1]:
import numpy as np
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
# Build N random points with random radii and a per-point hex color.
N = 4000
x = np.random.random(size=N) * 100
y = np.random.random(size=N) * 100
radii = np.random.random(size=N) * 1.5
# Red/green channels derive from position; blue is fixed at 150.
reds = (50 + 2 * x).astype(int)
greens = (30 + 2 * y).astype(int)
colors = ["#%02x%02x%02x" % (red, green, 150) for red, green in zip(reds, greens)]
In [2]:
output_notebook()
In [3]:
p = figure()
p.scatter(x, y, radius=radii,
fill_color=colors, fill_alpha=0.6,
line_color=None)
# output_file("color_scatter.html", title="color_scatter.py example")
show(p) # open a browser
In [ ]:
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from time import time
%matplotlib inline
plt.rcParams['figure.figsize'] = (1.5, 1.5) # set default size of plots
# plt.rcParams['image.interpolation'] = 'nearest'
# plt.rcParams['image.cmap'] = 'gray'
In [2]:
from sklearn.decomposition import PCA, RandomizedPCA, randomized_svd
from sklearn.cluster import KMeans
from sklearn.manifold import Isomap
from sklearn.model_selection import train_test_split, KFold
In [3]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
train.shape,test.shape
Out[3]:
In [4]:
train.head(1)
Out[4]:
In [5]:
test.head(1)
Out[5]:
In [9]:
label=train.pop('label')
In [7]:
def fuckpca(train, test, n):
    """Project train/test onto n whitened PCA components.

    Fits PCA on the training set only, then applies the same projection
    to the test set (no test-set leakage into the fit). Prints the
    elapsed wall-clock time and returns the transformed (train, test).
    """
    start = time()
    pca = PCA(n_components=n, whiten=True)
    train = pca.fit_transform(train)
    test = pca.transform(test)
    # print() call form: the original Python-2-only print statement is a
    # SyntaxError on Python 3; this form works on both.
    print('used {:.2f}s'.format(time() - start))
    return train, test
In [8]:
train_pca,test_pca=fuckpca(train,test,36)
In [9]:
train_pca.shape,test_pca.shape
Out[9]:
In [10]:
plt.imshow(train_pca[3].reshape(6,-1))
Out[10]:
In [11]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
In [12]:
model=GradientBoostingClassifier(verbose=1,n_estimators=300)
model
Out[12]:
In [99]:
# Fit the gradient-boosted classifier on the PCA-reduced features and
# report wall-clock training time.
start = time()
model.fit(train_pca, label)
# print() call form: the original Python-2 print statement breaks on Python 3.
print('used {:.2f}s'.format(time() - start))
In [100]:
result=model.predict(test_pca)
In [ ]:
In [13]:
def save(predictions=None, path='gbc.csv'):
    """Write predictions as a Kaggle-style submission CSV (ImageId, Label).

    predictions: sequence of predicted labels; defaults to the module-global
        `result` so the original no-argument `save()` call still works.
    path: output CSV path (default 'gbc.csv', as before).
    """
    import numpy as np
    preds = result if predictions is None else predictions
    # ImageId is 1-based, per the Kaggle MNIST submission format.
    submit = pd.DataFrame({'ImageId': np.arange(1, len(preds) + 1),
                           'Label': preds})
    submit.to_csv(path, index=False)
In [14]:
# sub=pd.concat([pd.Series(np.arange(1,len(result)+1)),pd.Series(result)],axis=1)
# sub.columns=['ImageId','Label']
In [108]:
model.score(after,label)
Out[108]:
In [115]:
tr=model.predict(after)
In [128]:
(tr==label).sum()/float(label.shape[0])
Out[128]:
In [129]:
from sklearn.metrics import confusion_matrix
In [ ]:
In [4]:
from sklearn.svm import SVC
In [5]:
svc=SVC(verbose=1)
svc
Out[5]:
In [7]:
train_pca=train
test_pca=test
In [10]:
# Fit the SVC on the (here un-reduced) features and report training time.
start = time()
svc.fit(train_pca, label)
# print() call form: the original Python-2 print statement breaks on Python 3.
print('used {:.2f}s'.format(time() - start))
In [19]:
result=svc.predict(test_pca)
In [21]:
save()
In [17]:
from sklearn import svm,datasets
from sklearn.model_selection import GridSearchCV
In [13]:
iris = datasets.load_iris()
iris.data[0:4]
Out[13]:
In [14]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 5, 10]}
In [15]:
model = svm.SVC()
In [18]:
classifier =GridSearchCV(model, parameters)
In [19]:
classifier.fit(iris.data, iris.target)
Out[19]:
In [20]:
classifier.best_params_
Out[20]:
In [37]:
# classifier.cv_results_
In [38]:
classifier.best_estimator_
Out[38]:
In [24]:
import scipy
In [33]:
print scipy.stats.expon(scale=100)
In [ ]:
In [39]:
parameter_dist = {
'C': scipy.stats.expon(scale=100),
'kernel': ['linear'],
'gamma': scipy.stats.expon(scale=.1),
}
In [119]:
# Bug fix: `grid_search` was never imported in this notebook (and the old
# sklearn.grid_search module was removed); RandomizedSearchCV lives in
# sklearn.model_selection, which this notebook already uses elsewhere.
from sklearn.model_selection import RandomizedSearchCV
classifier = RandomizedSearchCV(model, parameter_dist)
classifier.fit(iris.data, iris.target)
Out[119]:
In [120]:
classifier.best_params_, classifier.best_score_
Out[120]:
In [63]:
wtf=scipy.stats.expon(scale=10)
In [66]:
print wtf.rvs(5)
In [ ]:
wtf.rvs
In [ ]: