import pandas as pd
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
%matplotlib inline
from data import points, new_points
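# points and new_points are assumed to be 2-column (x, y) NumPy arrays shipped with the course data module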
print(points.shape, new_points.shape)
plt.figure(figsize=(6,8))
plt.subplot(2,1,1)
plt.scatter(points[:,0], points[:,1], alpha=.6, c='r')
plt.xlim([-3,3])
plt.subplot(2,1,2)
plt.scatter(new_points[:,0], new_points[:,1], alpha=.6, c='g')
# Import KMeans
from sklearn.cluster import KMeans
# Create a KMeans instance with 3 clusters: model
model = KMeans(n_clusters=3)
# Fit model to points
model.fit(points)
# Determine the cluster labels of new_points: labels
labels = model.predict(new_points)
# Print cluster labels of new_points
print(labels)
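# Inspect the coordinates of the three fitted cluster centres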
model.cluster_centers_
plt.figure(figsize=(8,5))
# Assign the columns of new_points: xs and ys
xs = new_points[:,0]
ys = new_points[:,1]
# Make a scatter plot of xs and ys, using labels to define the colors
plt.scatter(xs, ys, c=labels, alpha=0.5)
# Assign the cluster centers: centroids
centroids = model.cluster_centers_
# Assign the columns of centroids: centroids_x, centroids_y
centroids_x = centroids[:,0]
centroids_y = centroids[:,1]
# Make a scatter plot of centroids_x and centroids_y
plt.scatter(centroids_x, centroids_y, marker='D', s=50)
plt.show()
from sklearn import datasets
iris = datasets.load_iris()
print(type(iris.data))
iris.data.shape
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target']=iris.target
print(iris.target_names)
df['species'] = df['target'].map({0:iris.target_names[0],1:iris.target_names[1],2:iris.target_names[2]})
df.head()
samples = df.iloc[:,:4]
samples.head()
from sklearn.cluster import KMeans
model2 = KMeans(n_clusters=3)
model2.fit(samples)
labels2 = model2.predict(samples)
labels2
plt.scatter(samples.iloc[:,0], samples.iloc[:,1], c=labels2, alpha = .7)
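# Cross-tabulate the numeric target against the species name to confirm the 0/1/2 -> setosa/versicolor/virginica mapping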
pd.crosstab(df['target'], df['species'])
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,:4],df.species, test_size = .33, random_state=7)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
x_train.head(3)
from sklearn.neighbors import KNeighborsClassifier
# Create a k-nearest neighbours classifier: model
model = KNeighborsClassifier()
# Fit the classifier to the training data
model.fit(x_train, y_train)
# Predict the species labels of the test set: labels
labels = model.predict(x_test)
# Print the predicted labels
print(labels)
pd.crosstab(labels, y_test)
# Import KMeans
from sklearn.cluster import KMeans
# Create a KMeans instance with 3 clusters: model
model = KMeans(n_clusters=3)
# Fit the model to the four iris feature columns
model.fit(df.iloc[:,:4])
# Determine the cluster label of each iris sample: labels
labels = model.predict(df.iloc[:,:4])
# Print the cluster labels
print(labels)
pd.crosstab(labels, df.species)
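# Inertia: sum of squared distances from each sample to its nearest cluster centre (lower means tighter clusters)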
print(model.inertia_)
from sklearn.decomposition import PCA
pca = PCA(2)
pca
df_pca = pca.fit_transform(df.iloc[:,:4])
df_pca.shape
plt.scatter(df_pca[:,0], df_pca[:,1],c = labels)
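# For KMeans, score() returns the negative inertia of the given data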
model.score(df.iloc[:,:4])
ks = range(1, 6)
inertias = []
for k in ks:
    # Create a KMeans instance with k clusters: model
    model = KMeans(n_clusters=k)
    # Fit model to samples
    model.fit(df.iloc[:,:4])
    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)
# Plot ks vs inertias
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()
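# Choose k at the 'elbow', where adding more clusters stops reducing inertia substantially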
df.describe()
df.var()
import numpy as np
np.sqrt(df.var())
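# The features have different spreads, so standardizing them keeps any single feature from dominating the distance computation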
# Perform the necessary imports
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Create scaler: scaler
scaler = StandardScaler()
# Create KMeans instance: kmeans
kmeans = KMeans(n_clusters=3)
# Create pipeline: pipeline
pipeline = make_pipeline(scaler, kmeans)
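# The pipeline standardizes the data and then clusters it with a single fit()/predict() call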
names = '''Alcohol, Malic acid, Ash, Alcalinity of ash, Magnesium,
Total phenols, Flavanoids, Nonflavanoid phenols, Proanthocyanins, Color intensity, Hue, OD280/OD315 of diluted wines, Proline'''
# Strip the stray whitespace/newlines left by the triple-quoted string
names = [name.strip() for name in names.split(',')]
wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', names=names)
wine.info()
wine.var()
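# The feature variances span orders of magnitude (Proline in particular), so unscaled KMeans will be dominated by the high-variance features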
# Samples per cultivar in the wine data: 59 Barolo, 71 Grignolino, 48 Barbera
classes = [59, 71, 48]
# Only 13 names were supplied for the 14 CSV columns, so the numeric class column
# became the index; reset it, drop it, then rebuild the labels by name
wine = wine.reset_index()
del wine['index']
wine['class'] = ['Barolo' for i in range(59)] + ['Grignolino' for i in range(71)] + ['Barbera' for i in range(48)]
wine['class'].value_counts()
from sklearn.cluster import KMeans
model = KMeans(n_clusters=3)
labels = model.fit_predict(wine.iloc[:,:-1])
pd.crosstab(labels, wine['class'])
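# Without scaling, the clusters tend to follow the high-variance features rather than the grape varieties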
wine.var()
plt.scatter(wine.iloc[:,-3], wine.iloc[:,-2],
c= wine.iloc[:,-1].map({'Barbera':'r', 'Barolo':'g', 'Grignolino':'b'}),
alpha=.7)
plt.axis('equal')
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
wine.iloc[:,:-1] = scaler.fit_transform(wine.iloc[:,:-1])
plt.scatter(wine.iloc[:,-3], wine.iloc[:,-2],
c= wine.iloc[:,-1].map({'Barbera':'r', 'Barolo':'g', 'Grignolino':'b'}),
alpha=.7)
plt.axis('equal')
In [1]: from sklearn.preprocessing import StandardScaler
In [2]: scaler = StandardScaler()
In [3]: scaler.fit(samples)
Out[3]: StandardScaler(copy=True, with_mean=True, with_std=True)
In [4]: samples_scaled = scaler.transform(samples)
● StandardScaler and KMeans have similar methods
● Use fit() / transform() with StandardScaler
● Use fit() / predict() with KMeans
● Need to perform two steps: StandardScaler, then KMeans
● Use sklearn pipeline to combine multiple steps
● Data flows from one step into the next
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
scaler = StandardScaler()
kmeans = KMeans(n_clusters=3)
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(scaler, kmeans)
pipe
pipe.fit(wine.iloc[:,:-1])
labels = pipe.predict(wine.iloc[:,:-1])
labels
pd.crosstab(labels, wine['class'])
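# After standardizing, the cluster labels should correspond much more closely to the three grape varieties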
● StandardScaler is a "preprocessing" step
● MaxAbsScaler and Normalizer are other examples
Note that Normalizer() is different from StandardScaler(), which you used in the previous exercise. While StandardScaler() standardizes features (such as the features of the fish data from the previous exercise) by removing the mean and scaling to unit variance, Normalizer() rescales each sample (here, each company's stock price) independently of the others.
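To see the difference concretely, here is a minimal sketch on a small made-up array (the values are illustrative, not from the course data): StandardScaler() rescales each column, while Normalizer() rescales each row.
import numpy as np
from sklearn.preprocessing import StandardScaler, Normalizer
# Illustrative 3x2 array: the second column is on a much larger scale than the first
X = np.array([[1.0, 200.0],
              [2.0, 400.0],
              [3.0, 600.0]])
# StandardScaler works column-wise: each feature ends up with mean 0 and unit variance
print(StandardScaler().fit_transform(X))
# Normalizer works row-wise: each sample is rescaled to unit (L2) norm
print(Normalizer().fit_transform(X))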