# Scikit-learn, see https://scikit-learn.org
import sklearn
import sklearn.svm
import sklearn.datasets
# Math operations
import numpy as np
# Drawing functions
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
# Create two clusters of random points on a 2D plane, grouped into two classes
# Synthetic data set: 500 points for each of the two classes.
# make_blobs labels every point with the index of the center it was drawn
# around, so y == 0 belongs to the wide blob at (0, 0) and y == 1 to the
# tight blob at (3, 3).
number_of_samples = [500, 500]
centers = [[0.0, 0.0], [3.0, 3.0]]
clusters_std = [1.5, 0.5]  # one standard deviation per cluster
x, y = sklearn.datasets.make_blobs(
    n_samples=number_of_samples,
    centers=centers,
    cluster_std=clusters_std,
    shuffle=True,
    random_state=0,  # fixed seed so the generated points are reproducible
)
# Scatter plot of the first i points of each class (up to 2 * i points).
i = 50
for class_label, color, legend_name in (
    (1, "#00ff80", "Good Email"),
    (0, "blue", "Spam Email"),
):
    points = x[y == class_label][:i]
    plt.scatter(points[:, 0], points[:, 1], alpha=0.5, c=color, label=legend_name)
# Fixed axis limits keep all figures of this script on the same scale.
plt.xlim(-5, 5)
plt.ylim(-5, 5)
plt.legend()
# NOTE(review): the ./figures directory is not created here; presumably it
# already exists -- confirm before running from a fresh checkout.
plt.savefig(
    "./figures/02_email-cluster-050.png",
    dpi=500,
    bbox_inches="tight",
    pad_inches=0,
)
# Scatter plot of the first i points of each class, now with i = 500.
i = 500
# Start a fresh figure: without it, running this file as a plain script draws
# onto whatever figure is currently active, so the saved image would also
# contain the points and legend entries of the previous plot.
fig, ax = plt.subplots()
for class_label, color, legend_name in (
    (1, "#00ff80", "Good Email"),
    (0, "blue", "Spam Email"),
):
    points = x[y == class_label][:i]
    ax.scatter(points[:, 0], points[:, 1], alpha=0.5, c=color, label=legend_name)
# Fixed axis limits keep all figures of this script on the same scale.
ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)
ax.legend()
fig.savefig(
    "./figures/02_email-cluster-500.png",
    dpi=500,
    bbox_inches="tight",
    pad_inches=0,
)
# Fit a linear SVM on the first i samples of the data set.
i = 50
# Other parameter values noted in the original: gamma=0.1, degree=3
classifier = sklearn.svm.SVC(kernel="linear", C=10, gamma="scale")
classifier.fit(x[:i], y[:i])

# Plot the data points together with the learned decision boundary.
fig, ax = plt.subplots()
# NOTE(review): the scatter shows the first i points of EACH class, whereas
# the classifier was fitted on x[:i]; the plotted points are therefore not
# exactly the training set -- confirm this is intended.
for class_label, color, legend_name in (
    (1, "#00ff80", "Good Email"),
    (0, "blue", "Spam Email"),
):
    points = x[y == class_label][:i]
    ax.scatter(points[:, 0], points[:, 1], alpha=0.5, c=color, label=legend_name)

# Evaluate the decision function on a 30 x 30 grid covering the plot area.
xx = np.linspace(-5, 5, 30)
yy = np.linspace(-5, 5, 30)
XX, YY = np.meshgrid(xx, yy, indexing="ij")
xy = np.column_stack([XX.ravel(), YY.ravel()])
Z = classifier.decision_function(xy).reshape(XX.shape)
# Level 0 is the decision boundary (solid); levels -1 and +1 are drawn dashed.
ax.contour(
    XX,
    YY,
    Z,
    levels=[-1, 0, 1],
    colors="gray",
    linestyles=["--", "-", "--"],
)

ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)
ax.legend()
fig.savefig(
    "./figures/02_email-svm-050-linear.png",
    dpi=500,
    bbox_inches="tight",
    pad_inches=0,
)
plt.show()
# Plot the first i points of each class (i = 500) with the decision boundary
# of the existing classifier.
# NOTE(review): `classifier` is not fitted again in this section, so the
# boundary comes from whatever fit was done earlier, although the output file
# is named "svm-500" -- confirm this is intended.
i = 500
fig, ax = plt.subplots()
for class_label, color, legend_name in (
    (1, "#00ff80", "Good Email"),
    (0, "blue", "Spam Email"),
):
    points = x[y == class_label][:i]
    ax.scatter(points[:, 0], points[:, 1], alpha=0.5, c=color, label=legend_name)

# Evaluate the decision function on a 30 x 30 grid covering the plot area.
xx = np.linspace(-5, 5, 30)
yy = np.linspace(-5, 5, 30)
XX, YY = np.meshgrid(xx, yy, indexing="ij")
xy = np.column_stack([XX.ravel(), YY.ravel()])
Z = classifier.decision_function(xy).reshape(XX.shape)
# Level 0 is the decision boundary (solid); levels -1 and +1 are drawn dashed.
ax.contour(
    XX,
    YY,
    Z,
    levels=[-1, 0, 1],
    colors="gray",
    linestyles=["--", "-", "--"],
)

ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)
ax.legend()
fig.savefig(
    "./figures/02_email-svm-500-linear.png",
    dpi=500,
    bbox_inches="tight",
    pad_inches=0,
)
plt.show()
import sklearn.cluster
# Plot every point in a single colour, i.e. without using the class labels.
fig, ax = plt.subplots()
all_x_coords = x[:, 0]
all_y_coords = x[:, 1]
ax.scatter(all_x_coords, all_y_coords, c="gray", alpha=0.5, label="Unknown Email")
# Fixed axis limits keep all figures of this script on the same scale.
ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)
ax.legend()
fig.savefig(
    "./figures/02_email-kmeans-unknown.png",
    dpi=500,
    bbox_inches="tight",
    pad_inches=0,
)
plt.show()
# Split all points into two clusters with k-means; the labels y are not used.
# random_state pins the centroid initialisation so that the cluster indices
# (and therefore the red/orange colouring) are the same on every run.
# n_init is given explicitly because its default differs between
# scikit-learn versions.
y_pred = sklearn.cluster.KMeans(
    n_clusters=2,
    n_init=10,
    random_state=0,
).fit_predict(x)

# Plot each predicted cluster in its own colour. Both get the same legend
# text because k-means only groups points; it does not name the groups.
fig, ax = plt.subplots()
for cluster_index, color in ((1, "red"), (0, "orange")):
    members = x[y_pred == cluster_index]
    ax.scatter(members[:, 0], members[:, 1], alpha=0.5, c=color, label="? Email")
# Fixed axis limits keep all figures of this script on the same scale.
ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)
ax.legend()
fig.savefig(
    "./figures/02_email-kmeans-cluster.png",
    dpi=500,
    bbox_inches="tight",
    pad_inches=0,
)
plt.show()