```python
# IMPORT MODULES #
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

# load the iris data into a dataframe of features
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# NORMALIZATION #
# standardise each feature to mean 0 and standard deviation 1
for i in df.columns:  # df.columns = dataframe for all features, minus target
    df[i] = preprocessing.scale(df[i].astype('float64'))

df.describe()

# TRAIN-TEST SPLIT #
feature = df
train_feature, test_feature = train_test_split(feature, random_state=123,
                                               test_size=0.2)
print train_feature.shape
print test_feature.shape
# (120, 4)
# (30, 4)

# A LOOK AT THE MODEL #
KMeans(n_clusters=2)
# KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
#        n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
#        verbose=0)

# ELBOW CHART TO DETERMINE OPTIMUM K #
from scipy.spatial.distance import cdist
import numpy as np

clusters = range(1, 10)
# to store average distance values for each cluster solution from 1-9
meandist = []

# k-means cluster analysis for 1-9 clusters
for k in clusters:
    # prepare the model
    model = KMeans(n_clusters=k)
    # fit the model
    model.fit(train_feature)
    # test the model
    clusassign = model.predict(train_feature)
    # gives average distance values for each cluster solution:
    # cdist calculates the distance of each point from each centroid,
    # take the min distance (the cluster the point is placed in),
    # then get the average distance by summing & dividing by the
    # total number of samples
    meandist.append(np.sum(np.min(cdist(train_feature, model.cluster_centers_,
                                        'euclidean'), axis=1))
                    / train_feature.shape[0])
```
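The chart itself is not in this excerpt. A minimal sketch of how the `meandist` and `clusters` values from the loop above would typically be plotted (the labels are my own, not the original post's):

```python
import matplotlib.pyplot as plt

# average within-cluster distance for each k;
# the "elbow" where the curve flattens suggests a reasonable k
plt.plot(clusters, meandist)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Average distance to closest centroid')
plt.title('Elbow chart for selecting k')
plt.show()
```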
Plotting the PCA-transformed version of the breast cancer dataset:

```python
# main scatterplot
plt.figure(figsize=(10, 8))
plt.scatter(x_pca[:, 0], x_pca[:, 1], c=cancer['target'], cmap='plasma',
            alpha=0.4, edgecolors='black', s=40)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.xlim(20, -20)
```

We can see that malignant and benign cells cluster into two groups, and we could apply a linear classifier to this two-dimensional representation of the dataset.

To see how the individual features load onto the two components:

```python
# put feature values into dataframe
components = pd.DataFrame(pca.components_, columns=cancer['feature_names'])

# individual feature values
plt.figure(figsize=(8, 8))
ax2 = plt.gca()
ax2.set_xlim(-0.5, 0.5)
ax2.set_ylim(-0.5, 0.5)
# reference lines
ax2.vlines(0, -0.5, 0.5, linestyles='dotted', colors='grey')
# offset for labels
offset = 1.07
# arrow & text
for a, i in enumerate(components.columns):
    ax2.arrow(0, 0, components[i][0], -components[i][1],
              alpha=0.5, facecolor='white', head_width=0.01)
    ax2.annotate(i, (components[i][0] * offset, -components[i][1] * offset),
                 color='orange')
```
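These plots assume `cancer`, `x_pca`, and a fitted `pca` object that were set up before this excerpt begins. A minimal sketch of that setup, assuming the data came from `sklearn.datasets.load_breast_cancer` and was standardised before PCA (both assumptions on my part):

```python
from sklearn import preprocessing
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA

# assumed source of the `cancer` object used in the plots above
cancer = load_breast_cancer()

# standardise the features first, mirroring the normalization section
x_scaled = preprocessing.scale(cancer['data'])

# keep the first two principal components
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x_scaled)
```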
A small helper prints how much variance successive numbers of principal components explain, stopping once a threshold is reached:

```python
def pca_explained(X, threshold):
    '''
    prints optimal principal components based on
    threshold of PCA's explained variance

    Parameters
    ----------
    X : dataframe or array of features
    threshold : float
        proportion of explained variance to reach
    '''
    for n in range(2, X.shape[1] + 1):
        pca = PCA(n_components=n)
        pca.fit(X)
        explained = pca.explained_variance_ratio_.sum()
        print('{} components at {:.2f}% explained variance'.format(n, explained * 100))
        # stop at the first n that meets the threshold
        if explained >= threshold:
            break

pca_explained(X, 0.85)
# 2 components at 61.64% explained variance
# 3 components at 77.41% explained variance
# 4 components at 86.63% explained variance
```
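Refitting a new `PCA` for every candidate size is simple but does redundant work. A sketch of an alternative (my variant, not the post's code) that fits once and reads the same percentages off the cumulative explained-variance ratio; unlike the original, it also reports the one-component solution:

```python
import numpy as np
from sklearn.decomposition import PCA

def pca_explained_cumulative(X, threshold):
    # single full fit; the cumulative ratio at position n is the
    # variance explained by the first n components together
    pca = PCA(n_components=X.shape[1]).fit(X)
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    for n, v in enumerate(cumulative, start=1):
        print('{} components at {:.2f}% explained variance'.format(n, v * 100))
        if v >= threshold:
            break
```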