python string format() print examples

In [1]:
print 'before: {:.2f} after'.format(1.5555)
before: 1.56 after
In [2]:
print '{1},{0},{1},{2},{0}'.format('pos',777,True) 
777,pos,777,True,pos
In [3]:
print '{name},{age}'.format(age=18,name='cutie')  
cutie,18
In [4]:
has=['first', 2.00, 'third']
print '1st {0[0]} all: {0} last {0[2]} end'.format(has)
1st first all: ['first', 2.0, 'third'] last third end
In [5]:
print 'start--- {:,} ---end'.format(9876543210)
start--- 9,876,543,210 ---end
In [6]:
print 'start:{:>8}'.format(123)
start:     123
In [7]:
print 'start:{:0>8}'.format(123)
start:00000123
In [8]:
print 'start:{:A>8}'.format(123)
start:AAAAA123
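
The same format-spec mini-language works in Python 3, where print is a function; a minimal sketch of the equivalents (the f-string form needs Python 3.6+):

print('before: {:.2f} after'.format(1.5555))  # -> before: 1.56 after
print(f'start:{123:A>8}')                     # -> start:AAAAA123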

understand axes in matplotlib

An Axes is a subarea within a figure() in matplotlib:

figure()

plt.axes([0.3, 0.5, 0.4, 0.2])

The figure area is normalized to 1 x 1, so the rectangle is given as fractions of the figure: [left, bottom, width, height].
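
A minimal sketch of nesting one Axes inside another, using the rectangle from above:

import matplotlib.pyplot as plt

fig = plt.figure()
main = plt.axes([0.1, 0.1, 0.8, 0.8])   # covers most of the 1 x 1 figure
inset = plt.axes([0.3, 0.5, 0.4, 0.2])  # the smaller subarea from above
plt.show()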

In [2]:
import matplotlib.pyplot as plt
import numpy as np
In [6]:
physical_sciences=[ 13.8,  14.9,  14.8,  16.5,  18.2,  19.1,  20. ,  21.3,  22.5,
        23.7,  24.6,  25.7,  27.3,  27.6,  28. ,  27.5,  28.4,  30.4,
        29.7,  31.3,  31.6,  32.6,  32.6,  33.6,  34.8,  35.9,  37.3,
        38.3,  39.7,  40.2,  41. ,  42.2,  41.1,  41.7,  42.1,  41.6,
        40.8,  40.7,  40.7,  40.7,  40.2,  40.1];
computer_science=[ 13.6,  13.6,  14.9,  16.4,  18.9,  19.8,  23.9,  25.7,  28.1,
        30.2,  32.5,  34.8,  36.3,  37.1,  36.8,  35.7,  34.7,  32.4,
        30.8,  29.9,  29.4,  28.7,  28.2,  28.5,  28.5,  27.5,  27.1,
        26.8,  27. ,  28.1,  27.7,  27.6,  27. ,  25.1,  22.2,  20.6,
        18.6,  17.6,  17.8,  18.1,  17.6,  18.2]
year=[1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980,
       1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
       2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011]

Two methods to draw several distinct plots in one figure, both shown below: explicit plt.axes() rectangles, and a plt.subplot() grid.

In [10]:
# Plot in blue the % of degrees awarded to women in the Physical Sciences
plt.plot(year, physical_sciences, color='blue')

# Plot in red the % of degrees awarded to women in Computer Science
plt.plot(year, computer_science, color='red')

# Display the plot
plt.show()
In [8]:
# Create plot axes for the first line plot
plt.axes([0.05,0.05,0.425,0.9])

# Plot in blue the % of degrees awarded to women in the Physical Sciences
plt.plot(year,physical_sciences, color='blue')

# Create plot axes for the second line plot
plt.axes([.525,0.05,0.425,0.9])


# Plot in red the % of degrees awarded to women in Computer Science
plt.plot(year,computer_science, color='red')


# Display the plot
plt.show()
In [9]:
# Create a figure with 1x2 subplot and make the left subplot active
plt.subplot(1,2,1)

# Plot in blue the % of degrees awarded to women in the Physical Sciences
plt.plot(year, physical_sciences, color='blue')
plt.title('Physical Sciences')

# Make the right subplot active in the current 1x2 subplot grid
plt.subplot(1,2,2)


# Plot in red the % of degrees awarded to women in Computer Science
plt.plot(year, computer_science, color='red')
plt.title('Computer Science')

# Use plt.tight_layout() to improve the spacing between subplots
plt.tight_layout()
plt.show()

Jupyter notebook into post

Download the Jupyter notebook as HTML,

add that HTML source code to XYZ Html,

and embed it in the post, like this:

In [1]:
import numpy as np

from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook

N = 4000
x = np.random.random(size=N) * 100
y = np.random.random(size=N) * 100
radii = np.random.random(size=N) * 1.5
colors = ["#%02x%02x%02x" % (int(r), int(g), 150) for r, g in zip(50+2*x, 30+2*y)]
In [2]:
output_notebook()
Loading BokehJS ...
In [3]:
p = figure()

p.scatter(x, y, radius=radii,
          fill_color=colors, fill_alpha=0.6,
          line_color=None)

# output_file("color_scatter.html", title="color_scatter.py example")

show(p)  # open a browser
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from time import time

%matplotlib inline
plt.rcParams['figure.figsize'] = (1.5, 1.5) # set default size of plots
# plt.rcParams['image.interpolation'] = 'nearest'
# plt.rcParams['image.cmap'] = 'gray'
In [2]:
from sklearn.decomposition import PCA, RandomizedPCA, randomized_svd
from sklearn.cluster import KMeans
from sklearn.manifold import Isomap
from sklearn.model_selection import train_test_split, KFold
In [3]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
train.shape,test.shape
Out[3]:
((42000, 785), (28000, 784))
In [4]:
train.head(1)
Out[4]:
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 785 columns

In [5]:
test.head(1)
Out[5]:
pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 784 columns

In [9]:
label=train.pop('label')
In [7]:
def apply_pca(train,test,n):
    # fit PCA on train and apply the same projection to test
    start=time()
    pca=PCA(n_components=n,whiten=True)
    train=pca.fit_transform(train)
    test=pca.transform(test)
    print 'used {:.2f}s'.format(time()-start)
    return train,test
In [8]:
train_pca,test_pca=apply_pca(train,test,36)
used 4.97s
In [9]:
train_pca.shape,test_pca.shape
Out[9]:
((42000L, 36L), (28000L, 36L))
In [10]:
plt.imshow(train_pca[3].reshape(6,-1))
Out[10]:
<matplotlib.image.AxesImage at 0x1b99b358>
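The 36 PCA features are not pixels, so the image above is just the raw component values. To see a recognizable digit, project back to 784 dimensions with inverse_transform; a sketch (it refits PCA, since the fitted object is local to apply_pca):

pca = PCA(n_components=36, whiten=True).fit(train)
digit = pca.inverse_transform(pca.transform(train[:1]))  # back to 784 dims
plt.imshow(digit.reshape(28, 28), cmap='gray')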
In [11]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
In [12]:
model=GradientBoostingClassifier(verbose=1,n_estimators=300)
model
Out[12]:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=300, presort='auto', random_state=None,
              subsample=1.0, verbose=1, warm_start=False)
In [99]:
start=time()
model.fit(train_pca,label)
print 'used {:.2f}s'.format(time()-start)
      Iter       Train Loss   Remaining Time 
         1       79940.9682            6.62m
         2       70830.7802            6.62m
         3       64173.7553            6.61m
         4       59037.7864            6.58m
         5       54599.3908            6.58m
         6       50886.2372            6.57m
         7       47761.6971            6.55m
         8       45040.9071            6.54m
         9       42543.2014            6.55m
        10       40319.8099            6.56m
        20       27069.1592            6.42m
        30       20786.5203            6.28m
        40       17051.2103            6.03m
        50       14576.1295            5.82m
        60       12778.5881            5.59m
        70       11447.6600            5.35m
        80       10375.5170            5.12m
        90        9518.4201            4.88m
       100        8816.3706            4.64m
       200        5027.3031            2.26m
       300        3355.2391            0.00s
used 405.63s
In [100]:
result=model.predict(test_pca)
In [13]:
def save():
    import numpy as np
    # write predictions in Kaggle's submission format: ImageId,Label
    submit=pd.DataFrame({'ImageId':np.arange(1,len(result)+1),'Label':result})
    submit.to_csv('gbc.csv',index=False)
In [14]:
# sub=pd.concat([pd.Series(np.arange(1,len(result)+1)),pd.Series(result)],axis=1)
# sub.columns=['ImageId','Label']
In [108]:
model.score(after,label)  # 'after' comes from an earlier session, presumably the PCA-transformed training set
Out[108]:
0.9878095238095238
In [115]:
tr=model.predict(after)
In [128]:
(tr==label).sum()/float(label.shape[0])
Out[128]:
0.9878095238095238
In [129]:
from sklearn.metrics import confusion_matrix
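The import is not used in the cells that follow; a minimal sketch of how it would apply to the training predictions tr:

cm = confusion_matrix(label, tr)  # rows = true digit, columns = predicted digit
print cm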
In [4]:
from sklearn.svm import SVC
In [5]:
svc=SVC(verbose=1)
svc
Out[5]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=1)
In [7]:
# reuse the names, but with the raw pixels this time (no PCA)
train_pca=train
test_pca=test
In [10]:
start=time()
svc.fit(train_pca,label)
print 'used {:.2f}s'.format(time()-start)
[LibSVM]used 4429.63s
In [19]:
result=svc.predict(test_pca)
In [21]:
save()
In [17]:
from sklearn import svm,datasets
from sklearn.model_selection import GridSearchCV
In [13]:
iris = datasets.load_iris()
iris.data[0:4]
Out[13]:
array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2]])
In [14]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 5, 10]}
In [15]:
model = svm.SVC()
In [18]:
classifier = GridSearchCV(model, parameters)
In [19]:
classifier.fit(iris.data, iris.target)
Out[19]:
GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kernel': ('linear', 'rbf'), 'C': [1, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
In [20]:
classifier.best_params_
Out[20]:
{'C': 1, 'kernel': 'linear'}
In [37]:
# classifier.cv_results_
In [38]:
classifier.best_estimator_
Out[38]:
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
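Since refit=True by default, the GridSearchCV object itself predicts with the best estimator; a quick check on the training data:

preds = classifier.predict(iris.data)
print (preds == iris.target).mean()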
In [24]:
import scipy
In [33]:
print scipy.stats.expon(scale=100)  # a 'frozen' distribution object
<scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000008E57E10>
In [39]:
parameter_dist = {
  'C': scipy.stats.expon(scale=100),
  'kernel': ['linear'],
  'gamma': scipy.stats.expon(scale=.1),
}
In [119]:
from sklearn.model_selection import RandomizedSearchCV
classifier = RandomizedSearchCV(model, parameter_dist)
classifier.fit(iris.data, iris.target)
Out[119]:
RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'kernel': ['linear'], 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000008E79FD0>, 'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000008E70080>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=0)
In [120]:
classifier.best_params_, classifier.best_score_
Out[120]:
({'C': 1.3991944739478859, 'gamma': 0.0022802232812657304, 'kernel': 'linear'},
 0.9933333333333333)
In [63]:
dist=scipy.stats.expon(scale=10)
In [66]:
print dist.rvs(5)  # draw 5 random samples
[ 10.29516445   1.53962143   2.67578885   0.21101641   1.31133069]