pca_definition.py
import sklearn.datasets as sd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
# The aim is to reduce the feature vector to a two-element vector using PCA.
# PCA creates new dimensions (principal components) as linear combinations of
# the original features, ordered by how much of the data's variance they explain.
# 0. DATA PREPARATION
# the dataset contains physical measurements of three iris species
# (Setosa, Versicolour, Virginica) - each species has 50 instances in
# sequence, which gives 150 rows
# the feature vector contains 4 features: sepal length, sepal width,
# petal length and petal width
# load the data from sklearn.datasets
df = sd.load_iris()
# take only the numerical data as a NumPy array
X = df.data
# take the feature names
df_header = df.feature_names
# take the numerical class labels
y = df.target
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)
# 1. STANDARDIZATION
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
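# note: the scaler is fit on the training data only; the same mean/scale is
# then re-used for the test data, so no information from the test set leaks
# into the preprocessing step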
# 2. CREATE COVARIANCE MATRIX
cov_mat = np.cov(X_train_std.T)
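# note: np.cov treats each row as one variable, hence the transpose; the
# resulting covariance matrix has shape (n_features, n_features), here 4 x 4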
# 3. USE THE BUILT-IN NUMPY FUNCTION TO CALCULATE THE EIGENVALUES AND
# EIGENVECTORS OF THE COVARIANCE MATRIX
# eigenvalues - there are as many eigenvalues as there are features; each one
#   measures how much variance its eigenvector captures, i.e. how important
#   that eigenvector is
# eigenvectors - these are the new dimensions (new features)
eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)
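# note: for a symmetric matrix such as a covariance matrix, np.linalg.eigh is
# the more specialised choice (it guarantees real eigenvalues in ascending
# order), but np.linalg.eig works here too since the pairs are sorted below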
# present the importance of the eigenvectors via the explained-variance ratio
total = sum(eigen_vals)
variation_exp = [(i/total) for i in sorted(eigen_vals, reverse=True)]
cumsum_variation = np.cumsum(variation_exp)
plt.subplot(2, 1, 1)
plt.title('explained variance')
plt.bar(range(1, 5), variation_exp, alpha=0.5, align='center', label='individual explained variance')
plt.step(range(1, 5), cumsum_variation, where='mid', label='cumulative explained variance')
plt.xlabel('Index of the sorted eigenvalues')
plt.ylabel('Explained variance')
plt.legend()
# the figure is shown once, after the scatter plot is added as the second subplot below
# pair each eigenvalue with its eigenvector and sort by decreasing eigenvalue
# (np.linalg.eig does not return them in any particular order)
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i]) for i in range(len(eigen_vals))]
eigen_pairs.sort(key=lambda k: k[0], reverse=True)
# 4. CREATE A TWO-DIMENSIONAL PROJECTION (LOADING) MATRIX
w = np.hstack((eigen_pairs[0][1][:, np.newaxis],
               eigen_pairs[1][1][:, np.newaxis]))
# 5. TRANSFORM THE ORIGINAL DATA
X_train_pca = X_train_std.dot(w)
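# the same loading matrix can also project the standardised test data
# (a minimal follow-up sketch; X_test_pca is not used further below)
X_test_pca = X_test_std.dot(w)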
# present the iris data in the pca1 and pca2 dimensions
colors = ['b', 'k', 'y']
y_target_label = ['Setosa', 'Versicolour', 'Virginica']
plt.subplot(2, 1, 2)
for l, c, ytl in zip(np.unique(y_train), colors, y_target_label):
    plt.scatter(X_train_pca[y_train == l, 0],
                X_train_pca[y_train == l, 1],
                c=c,
                label=ytl)
plt.title('scatter plot for only two principal components')
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.legend()
plt.tight_layout()
plt.show()
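# a minimal cross-check sketch: scikit-learn's PCA should reproduce the
# explained-variance ratios computed manually above (the signs of individual
# components may differ, since eigenvector signs are arbitrary)
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_train_pca_skl = pca.fit_transform(X_train_std)
print('manual explained variance ratios :', variation_exp[:2])
print('sklearn explained variance ratios:', pca.explained_variance_ratio_)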