import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from mpl_toolkits.mplot3d import Axes3D
import pydotplus as pyd
from IPython.display import Image
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.cluster import KMeans, MiniBatchKMeans, DBSCAN
from sklearn.decomposition import PCA
df = pd.read_csv('data.csv',header=None)
df.head()
X = df.iloc[:,0].values.reshape(-1,1)
y = df.iloc[:,1].values.reshape(-1,1)
linear = LinearRegression().fit(X, y)
print('Coefficient:', linear.coef_)
print('Intercept', linear.intercept_)
plt.plot(X, y, '.')
plt.plot(X, linear.predict(X))
plt.show()
1.1.2 Multivariable linear regression: multiple independent variables, one dependent variable
df = pd.read_csv('Delivery.csv',header=None)
df.head()
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values.reshape(-1,1)
linear = LinearRegression().fit(X, y)
print('Coefficient:', linear.coef_)
print('Intercept', linear.intercept_)
x0 = df.iloc[:,0]
x1 = df.iloc[:,1]
ax = plt.figure().add_subplot(projection='3d')
ax.scatter(x0,x1,y,c='r',s=100)
x0, x1 = np.meshgrid(x0, x1)
z = x0*linear.coef_[0][0] + x1*linear.coef_[0][1] + linear.intercept_[0]
ax.plot_surface(x0, x1, z)
plt.show()
1.1.3 Polynomial regression: fits a curve by extracting polynomial features and then applying linear regression
df = pd.read_csv('job.csv')
df.head()
X = df.iloc[:,1].values.reshape(-1,1)
y = df.iloc[:,2].values.reshape(-1,1)
X_poly = PolynomialFeatures(5).fit_transform(X)
linear = LinearRegression().fit(X_poly, y)
plt.plot(X, y, '.')
plt.plot(X, linear.predict(X_poly))
plt.show()
Normal equation method: directly finds the global minimum of the cost function in closed form; time complexity O(n^3), where n is the number of features (see the sketch after these notes)
Feature scaling: brings feature values to a comparable scale
Cross-validation: with a small sample, rotate the folds so every part of the data serves as both training set and test set
Overfitting: the model is too complex; it fits the training set well but the test set poorly (contrast underfitting and a proper fit)
Preventing overfitting: drop noisy features, add more data, regularize
Regularization: L2 regularization (adds the sum of squared coefficients), L1 regularization (adds the sum of absolute coefficients)
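A minimal numpy sketch of the normal equation theta = (X^T X)^(-1) X^T y (the toy arrays below are made up for illustration):
# Normal equation: closed-form global minimizer of the least-squares cost.
X_toy = np.array([[1.0], [2.0], [3.0], [4.0]])        # made-up feature values
y_toy = np.array([3.1, 5.0, 7.2, 8.9])                # made-up targets, roughly y = 2x + 1
X_b = np.c_[np.ones(len(X_toy)), X_toy]               # prepend a bias column
theta = np.linalg.pinv(X_b.T @ X_b) @ X_b.T @ y_toy   # pinv is safer than inv if X^T X is singular
print('Intercept and slope:', theta)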
1.2.1 Ridge regression: originally introduced for the case of more features than samples, where X^T X is not full rank and cannot be inverted; it adds L2 regularization to the cost function. Today it is also used to introduce bias into the estimate to improve accuracy and to handle multicollinearity; it is a biased estimator (see the closed-form sketch after these notes)
Choose the ridge coefficient so that the ridge estimates of the regression coefficients are basically stable and the residual sum of squares does not grow much
1.2.2 LASSO: good at handling multicollinear data; also a biased estimator; adds L1 regularization to the cost function, which drives the coefficients of some (noise) features exactly to zero, whereas ridge only shrinks them toward zero
1.2.3 Elastic Net: combines the regularization characteristics of ridge and LASSO
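Ridge also has a closed form, theta = (X^T X + alpha*I)^(-1) X^T y; the alpha*I term is exactly what restores invertibility when features outnumber samples. A hedged sketch on made-up data:
# Ridge closed form: the alpha*I term makes X^T X + alpha*I invertible.
rng = np.random.default_rng(0)
X_toy = rng.normal(size=(5, 8))     # 8 features but only 5 samples: X^T X alone is singular
y_toy = rng.normal(size=5)
alpha = 1.0
theta = np.linalg.solve(X_toy.T @ X_toy + alpha * np.eye(8), X_toy.T @ y_toy)
print('Ridge coefficients:', theta)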
df = pd.read_csv('longley.csv',index_col=0)
df.head()
X = df.iloc[:,1:]
y = df.iloc[:,0]
alphas_test = np.linspace(0.001, 1)
ridge = RidgeCV(alphas=alphas_test, store_cv_values=True).fit(X, y)  # in newer scikit-learn this argument is store_cv_results and cv_values_ becomes cv_results_
lasso = LassoCV().fit(X, y)
elastic = ElasticNetCV().fit(X, y)
print('Alpha of Ridge:', ridge.alpha_)
print('Coefficient of Ridge:', ridge.coef_)
print('\nAlpha of LASSO:', lasso.alpha_)
print('Coefficient of LASSO:', lasso.coef_)
print('\nAlpha of Elastic Net:', elastic.alpha_)
print('Coefficient of Elastic Net:', elastic.coef_)
plt.figure(figsize=(14,4))
plt.subplot(121)
plt.plot(alphas_test, ridge.cv_values_.mean(axis=0))
plt.plot(ridge.alpha_, min(ridge.cv_values_.mean(axis=0)), 'ro')
plt.title('Alpha values and loss function')
plt.subplot(122)
x = np.arange(len(y))
plt.plot(x, y, label='real')
plt.plot(x, ridge.predict(X), 'y', label='Ridge')
plt.plot(x, lasso.predict(X), 'g', label='LASSO')
plt.plot(x, elastic.predict(X), 'r', label='Elastic Net')
plt.legend()
plt.show()
df = pd.read_csv('LR-testSet.csv',header=None)
df.head()
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
logi = LogisticRegression().fit(X, y)
print('Classification report:')
print(classification_report(y, logi.predict(X)))
print('Score:', logi.score(X, y))
plt.plot(X[y==0][0], X[y==0][1], 'bo', label='label 0')
plt.plot(X[y==1][0], X[y==1][1], 'rx', label='label 1')
plt.legend()
x_boundary = np.array([min(X[0]), max(X[0])])
y_boundary = (-logi.intercept_ - x_boundary*logi.coef_[0][0]) / logi.coef_[0][1]
plt.plot(x_boundary, y_boundary, 'k')
plt.show()
2.1.2 Nonlinear logistic regression: extract polynomial features, then apply linear logistic regression (same idea as polynomial regression)
df = pd.read_csv('LR-testSet2.txt',header=None)
df.head()
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_poly = PolynomialFeatures(5).fit_transform(X)
logi = LogisticRegression().fit(X_poly, y)
print('Classification report:')
print(classification_report(y, logi.predict(X_poly)))
print('Score:', logi.score(X_poly, y))
xx, yy = np.meshgrid(np.arange(min(X[0])-1, max(X[0])+1, 0.02),
                     np.arange(min(X[1])-1, max(X[1])+1, 0.02))
zz = np.c_[xx.ravel(), yy.ravel()]
zz_poly = PolynomialFeatures(5).fit_transform(zz)
zz_predict = logi.predict(zz_poly).reshape(xx.shape)
plt.contourf(xx, yy, zz_predict, alpha=0.8)
plt.scatter(X[0], X[1], c=y)
plt.show()
Nearest-neighbour classification, using Euclidean distance
k is usually chosen odd; the algorithm is relatively expensive, and it can be inaccurate when the class distribution is imbalanced (one class much larger than the others)
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
knn = KNeighborsClassifier().fit(X_train, y_train)
print('Classification report:')
print(classification_report(y_test, knn.predict(X_test)))
print('Score:', knn.score(X_test, y_test))
Suited to analyzing discrete data (convert continuous data to discrete first)
Entropy: a measure of uncertainty; the larger the entropy, the greater the uncertainty
ID3 algorithm: splits decision tree nodes by maximizing information gain, with the highest-gain attribute at the root; continuous variables are discretized by choosing the threshold that maximizes information gain
2.3.1 C4.5 algorithm: introduces the gain ratio (information gain divided by split information) to correct ID3's bias toward attributes with many distinct values
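To make the ID3 criterion concrete, a small sketch computing entropy and the information gain of one candidate split (the label arrays are made up):
# Entropy H = -sum(p*log2(p)); information gain = H(parent) - weighted H(children).
def entropy(labels):
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -(p * np.log2(p)).sum()
parent = np.array([1, 1, 1, 0, 0, 0, 0, 0])   # made-up class labels
left, right = parent[:4], parent[4:]          # one candidate split
gain = entropy(parent) - (len(left)/len(parent)*entropy(left)
                          + len(right)/len(parent)*entropy(right))
print('Information gain of this split:', gain)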
df = pd.read_csv('AllElectronics.csv', index_col=0)
df.head()
X = pd.get_dummies(df.iloc[:,:-1])
y = df.iloc[:,-1]
dtree = DecisionTreeClassifier(criterion='entropy').fit(X,y)
dot = export_graphviz(dtree, feature_names=X.columns, class_names=dtree.classes_, filled=True, rounded=True)  # dtree.classes_ is sorted to match the tree's internal class order; y.unique() may not be
graph = pyd.graph_from_dot_data(dot)
graph.set_size("5,5!")
Image(graph.create_png())
2.3.2 CART algorithm: selects features by minimizing Gini impurity and recursively builds a binary decision tree; the attribute with the largest Gini gain becomes the root; effective on small datasets
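For comparison with entropy, a hedged sketch of Gini impurity, 1 - sum(p_k^2) (the labels are made up):
# Gini impurity: 0 for a pure node, 0.5 at worst for two balanced classes.
def gini(labels):
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1 - (p ** 2).sum()
print('Mixed node:', gini(np.array([1, 1, 0, 0])))   # 0.5
print('Pure node:', gini(np.array([1, 1, 1, 1])))    # 0.0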
df = pd.read_csv('cart.csv', index_col=0)
df.head()
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
dtree = DecisionTreeClassifier().fit(X,y)
dot = export_graphviz(dtree, feature_names=X.columns, class_names=['No','Yes'], filled=True, rounded=True)
graph = pyd.graph_from_dot_data(dot)
graph.set_size("3,5!")
Image(graph.create_png())
2.3.3 Pruning: pre-pruning and post-pruning; fewer nodes, lower algorithmic complexity, less overfitting
def plotLr(model, title, a):
    print(f'Score for {title}: {model.score(X_test, y_test)}')
    xx, yy = np.meshgrid(np.arange(min(X[0])-1, max(X[0])+1, 0.02),
                         np.arange(min(X[1])-1, max(X[1])+1, 0.02))
    zz_predict = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.subplot(a)
    plt.title(title)
    plt.contourf(xx, yy, zz_predict, alpha=0.8)
    plt.scatter(X[0], X[1], c=y)
df = pd.read_csv('LR-testSet2.txt',header=None)
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y)
dtree = DecisionTreeClassifier().fit(X_train, y_train)
pruning_dtree = DecisionTreeClassifier(max_depth=6,min_samples_split=4).fit(X_train, y_train)
plt.figure(figsize=(14,4))
print('Score of train:', dtree.score(X_train, y_train))
plotLr(dtree,'Non-pruning',121)
print('Score of train:', pruning_dtree.score(X_train, y_train))
plotLr(pruning_dtree,'Pruning',122)
Well suited to text data; uses prior probabilities. With many features, full Bayesian estimation would require an enormous number of statistics, so naive Bayes assumes the features are conditionally independent
2.4.1 The Bernoulli model ignores repeated words, the multinomial model counts them, the mixed model counts them at training time but not at test time, and the Gaussian model suits continuous variables
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)
ber_nb = BernoulliNB().fit(X_train, y_train)
mul_nb = MultinomialNB().fit(X_train, y_train)
gau_nb = GaussianNB().fit(X_train, y_train)
print('BernoulliNB score:', ber_nb.score(X_test, y_test))
print('Confusion matrix:\n',confusion_matrix(y_test, ber_nb.predict(X_test)))
print('\nMultinomialNB score:', mul_nb.score(X_test, y_test))
print('Confusion matrix:\n',confusion_matrix(y_test, mul_nb.predict(X_test)))
print('\nGaussianNB score:', gau_nb.score(X_test, y_test))
print('Confusion matrix:\n',confusion_matrix(y_test, gau_nb.predict(X_test)))
2.4.2 Bag of words: represents a text as an unordered collection of words with their occurrence counts; vectorize the text with CountVectorizer before modelling, optionally filtering stop words
def printBow(i):
    word = cv.get_feature_names_out()[order[i]]   # get_feature_names() was removed in scikit-learn 1.2
    freq = count_total[order[i]]
    print(f'No.{i+1}: The word [{word}] occurs [{freq}] times.')
news = datasets.fetch_20newsgroups(subset='all')
X, y = news.data[:3000], news.target[:3000]
cv = CountVectorizer(stop_words='english')
X_cv = cv.fit_transform(X)
count_total = X_cv.toarray().sum(axis=0)
order = np.argsort(-count_total)
print('Top 5 words in 3000 news articles (by frequency):')
for i in range(5): printBow(i)
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)
mul_nb = MultinomialNB().fit(X_train_cv, y_train)
print('\nScore for train:',mul_nb.score(X_train_cv, y_train))
print('Score for test:',mul_nb.score(X_test_cv, y_test))
2.4.3 TF-IDF: compute the term frequency (TF), filter stop words, then weight by the inverse document frequency (IDF), which is inversely related to how common a word is; multiplying TF by IDF scores each word's importance. TfidfVectorizer performs the vectorization and TF-IDF weighting in one step
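To show roughly what TfidfVectorizer computes, a by-hand sketch using the plain textbook formula idf = log(N/df); scikit-learn's actual variant adds smoothing and L2 normalization, so its numbers differ:
# Simplified TF-IDF: tf = raw count in a document, idf = log(N / df).
docs = [['apple', 'banana', 'apple'],     # made-up tokenized corpus
        ['banana', 'cherry'],
        ['apple', 'cherry', 'cherry']]
N = len(docs)
for word in sorted({w for d in docs for w in d}):
    df_count = sum(word in d for d in docs)   # number of documents containing the word
    idf = np.log(N / df_count)
    tfidf = [d.count(word) * idf for d in docs]
    print(f'{word}: idf={idf:.3f}, tf-idf per doc={np.round(tfidf, 3)}')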
def printTf(i):
    weight = X_tf.toarray()[i]
    order = np.argsort(-weight)
    word_list = []
    for j in range(5):
        word = tf.get_feature_names_out()[order[j]]   # get_feature_names() was removed in scikit-learn 1.2
        word_list.append(word)
    print(f'In news {i+1}: {word_list}')

def printCv(i):
    count = X_cv.toarray()[i]
    order = np.argsort(-count)
    word_list = []
    for j in range(5):
        word = cv.get_feature_names_out()[order[j]]
        word_list.append(word)
    print(f'In news {i+1}: {word_list}')
news = datasets.fetch_20newsgroups(subset='all')
X, y = news.data[:3000], news.target[:3000]
tf = TfidfVectorizer(stop_words='english')
X_tf = tf.fit_transform(X)
print('Top 5 key words (TFIDF):')
for i in range(5): printTf(i)
print('\nTop 5 key words (frequency):')
for i in range(5): printCv(i)
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train_tf = tf.transform(X_train)
X_test_tf = tf.transform(X_test)
mul_nb = MultinomialNB().fit(X_train_tf, y_train)
print('\nScore for train:',mul_nb.score(X_train_tf, y_train))
print('Score for test:',mul_nb.score(X_test_tf, y_test))
Well suited to complex classification such as image recognition. An SVM finds the hyperplane separating the two classes that maximizes the margin; it was the strongest general-purpose algorithm before deep learning took off (2012)
2.5.1 Linearly separable case
Training reduces to convex optimization: 1. unconstrained problems, Fermat's theorem; 2. equality-constrained problems, Lagrange multipliers; 3. inequality-constrained problems, the KKT conditions (a generalization of Lagrange multipliers)
The Lagrangian formulation is further converted into its dual problem, which the SMO algorithm optimizes
df = pd.read_csv('LR-testSet.csv',header=None)
X, y = df.iloc[:,:-1], df.iloc[:,-1]
svc = SVC(kernel='linear').fit(X, y)
print('Some support vectors:\n', svc.support_vectors_[:5])
print('Index of the support vectors:', svc.support_[:5])
print('Number of support vectors (on each side):', svc.n_support_)
x = np.array([min(X[0]), max(X[0])])
k = -svc.coef_[0][0]/svc.coef_[0][1]
d = -svc.intercept_/svc.coef_[0][1]
y_boundary = k*x + d
v1 = svc.support_vectors_[5]
v2 = svc.support_vectors_[-1]
y_v1 = k*x + (v1[1] - k*v1[0])
y_v2 = k*x + (v2[1] - k*v2[0])
plt.plot(X[y==0][0], X[y==0][1], 'bo', label='label 0')
plt.plot(X[y==1][0], X[y==1][1], 'rx', label='label 1')
plt.plot(x, y_boundary, 'k', label='Decision boundary')
plt.plot(x, y_v1, 'r--', label='Support vector 1')
plt.plot(x, y_v2, 'b--', label='Support vector 2')
plt.legend()
plt.show()
In the linearly non-separable case: introduce slack variables and a penalty function
The aim is as few misclassified points as possible, each as close to the decision boundary as possible; this adapts the dual problem to the non-separable case
2.5.2 Nonlinear case: map the nonlinear problem from the low-dimensional space into a high-dimensional space, where it becomes a linear problem
The mapping can trigger the curse of dimensionality and greatly increase computation time, so kernel functions are introduced to handle the nonlinear mapping implicitly: the degree-h polynomial kernel, the Gaussian radial basis function (RBF) kernel, and the sigmoid kernel (compared in a sketch after the demo below)
SVM strengths: 1. the model's complexity depends on the number of support vectors, so it does not overfit easily 2. the model depends entirely on the support vectors, not on the remaining points 3. if training yields few support vectors, the model generalizes well
df = pd.read_csv('LR-testSet2.txt',header=None)
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y)
svc = SVC().fit(X_train, y_train)
print('Score of train:', svc.score(X_train, y_train))
plotLr(svc,'SVC',111)
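The demo above used SVC's default RBF kernel; a quick hedged comparison of the kernels named earlier on the same split (default degree and gamma, chosen only for illustration):
# Try each kernel from the notes above on the train/test split already in scope.
for kernel in ['poly', 'rbf', 'sigmoid']:
    svc_k = SVC(kernel=kernel).fit(X_train, y_train)
    print(f'{kernel}: test score = {svc_k.score(X_test, y_test):.3f}')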
That is, deep learning, which has grown rapidly in recent years with big data, compute, and better algorithms
Single-layer perceptron (SLP): cannot solve nonlinear problems such as XOR
Linear neural network: structured like the perceptron, but the activation is changed from the sign function to the linear purelin function (y = x); adding nonlinear input terms lets it solve XOR
Delta learning rule: a continuous-perceptron learning rule based on gradient descent
BP neural network (Back Propagation Neural Network): propagates errors backwards, solving the training problem for multi-layer networks; the architecture has an input layer, hidden layers, and an output layer
Common activation functions: Sigmoid (the logistic function, range (0,1)), Tanh (hyperbolic tangent, range (-1,1)), Softsign (smoother than Tanh, range (-1,1)), ReLU (rectified linear unit, the most common; see the sketch below)
Multi-layer perceptron (MLP): applies the BP algorithm and contains hidden layers
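A small sketch of the four activation functions listed above, written as plain numpy formulas:
# Standard formulas for the activations named in the notes.
z = np.linspace(-5, 5, 200)
activations = {
    'sigmoid': 1 / (1 + np.exp(-z)),    # range (0, 1)
    'tanh': np.tanh(z),                 # range (-1, 1)
    'softsign': z / (1 + np.abs(z)),    # range (-1, 1), flatter tails than tanh
    'relu': np.maximum(0, z),           # max(0, z), the most common choice
}
for name, value in activations.items():
    plt.plot(z, value, label=name)
plt.legend()
plt.show()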
digits = datasets.load_digits()
X, y = digits.data, digits.target
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y)
mlp = MLPClassifier().fit(X_train, y_train)
print('Confusion matrix:')
print(confusion_matrix(y_test, mlp.predict(X_test)))
print('\nClassification report:')
print(classification_report(y_test, mlp.predict(X_test)))
Combine several learners to obtain a better one
2.7.1 Bagging: short for bootstrap aggregating; draw samples with replacement to form several datasets, fit a model on each, and combine them by voting. Suits complex data; the individual learners have no strong dependence on one another (a bootstrap sketch follows)
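Bootstrap resampling itself is just drawing with replacement; a minimal sketch:
# One bootstrap sample: duplicates are expected, and on average only about
# 63% (1 - 1/e) of the distinct original samples appear in each draw.
rng = np.random.default_rng(0)
idx = rng.choice(10, size=10, replace=True)
print('Bootstrap indices:', idx)
print('Distinct fraction:', len(set(idx)) / 10)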
def plotIris(model, title, a):
    print(f'Score for {title}: {model.score(X_test, y_test)}')
    xx, yy = np.meshgrid(np.arange(min(X[:,0])-1, max(X[:,0])+1, 0.02),
                         np.arange(min(X[:,1])-1, max(X[:,1])+1, 0.02))
    zz_predict = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.subplot(a)
    plt.title(title)
    plt.contourf(xx, yy, zz_predict)
    plt.scatter(X[:,0], X[:,1], c=y)
iris = datasets.load_iris()
X = iris.data[:,:2]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
knn = KNeighborsClassifier().fit(X_train, y_train)
dtree = DecisionTreeClassifier().fit(X_train, y_train)
bagging_knn = BaggingClassifier(knn, n_estimators=100).fit(X_train, y_train)
bagging_dtree = BaggingClassifier(dtree, n_estimators=100).fit(X_train, y_train)
plt.figure(figsize=(14,8))
plotIris(knn,'knn',221)
plotIris(bagging_knn,'bagging knn',223)
plotIris(dtree,'dtree',222)
plotIris(bagging_dtree,'bagging dtree',224)
2.7.2 Random forest: RF = decision trees + bagging + random feature selection. Bagging generates the samples, a random subset of features is used to build each CART tree, and the many trees vote on the result; usually more accurate than a single decision tree
df = pd.read_csv('LR-testSet2.txt',header=None)
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y)
dtree = DecisionTreeClassifier().fit(X_train, y_train)
RF = RandomForestClassifier().fit(X_train, y_train)
plt.figure(figsize=(14,4))
plotLr(dtree,'dtree',121)
plotLr(RF,'RF',122)
2.7.3 AdaBoost (a boosting algorithm): when resampling, increase the weights of the samples misclassified by the previous weak classifier, then train the next weak classifier; repeat until a target error rate or the maximum number of iterations is reached, then form the final strong classifier, in which weak classifiers with lower error rates receive larger weights. The individual classifiers depend strongly on one another (one round of the weight update is sketched below)
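One round of the AdaBoost weight update, sketched by hand with labels in {-1, +1} (both arrays are made up):
# alpha = 0.5*ln((1-err)/err); misclassified samples are up-weighted for the next learner.
y_true = np.array([1, 1, -1, -1, 1])      # made-up labels
y_pred = np.array([1, -1, -1, -1, 1])     # made-up weak-learner output (one mistake)
w = np.full(len(y_true), 1/len(y_true))   # start from uniform sample weights
err = w[y_pred != y_true].sum()           # weighted error rate
alpha = 0.5 * np.log((1 - err) / err)     # this learner's weight in the final vote
w = w * np.exp(-alpha * y_true * y_pred)  # wrong samples grow, correct ones shrink
w /= w.sum()                              # renormalize
print('Learner weight alpha:', alpha)
print('Updated sample weights:', w)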
df = pd.read_csv('LR-testSet2.txt',header=None)
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y)
dtree = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
adaboost = AdaBoostClassifier(dtree).fit(X_train, y_train)
plt.figure(figsize=(14,4))
plotLr(dtree,'dtree',121)
plotLr(adaboost,'adaboost',122)
2.7.4 Stacking: several different classifiers predict on the training set, and their predictions are fed into a second-level classifier that produces the final prediction
Voting: the predictions of the different classifiers are voted on directly, with no second-level classifier
def plotIrisCV(model, title, a):
    score = cross_val_score(model, X, y, cv=3).mean()
    print(f'CV Score for {title}: {score}')
    xx, yy = np.meshgrid(np.arange(min(X[:,0])-1, max(X[:,0])+1, 0.02),
                         np.arange(min(X[:,1])-1, max(X[:,1])+1, 0.02))
    zz_predict = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.subplot(a)
    plt.title(title)
    plt.contourf(xx, yy, zz_predict)
    plt.scatter(X[:,0], X[:,1], c=y)
iris = datasets.load_iris()
X = iris.data[:,1:3]
y = iris.target
knn = KNeighborsClassifier(n_neighbors=1).fit(X, y)
dtree = DecisionTreeClassifier().fit(X, y)
logi = LogisticRegression().fit(X, y)
gau_nb = GaussianNB().fit(X, y)
stacking = StackingClassifier([knn,dtree,logi,gau_nb], logi).fit(X, y)
voting = VotingClassifier([('1',knn),('2',dtree),('3',logi),('4',gau_nb)]).fit(X, y)
plt.figure(figsize=(14,12))
plotIrisCV(knn,'knn',321)
plotIrisCV(dtree,'dtree',322)
plotIrisCV(logi,'logi',323)
plotIrisCV(gau_nb,'gau_nb',324)
plotIrisCV(stacking,'stacking',325)
plotIrisCV(voting,'voting',326)
data = np.genfromtxt('kmeans.txt')
data[:5]
kmeans = KMeans(4).fit(data)
result = kmeans.predict(data)
centers = kmeans.cluster_centers_
print('Cluster centers:\n', centers)
mark = ['or', 'ob', 'og', 'oy']
for i, d in enumerate(data):
    plt.plot(d[0], d[1], mark[result[i]])
mark = ['*r', '*b', '*g', '*y']
for i, center in enumerate(centers):
    plt.plot(center[0], center[1], mark[i], markersize=20)
xx, yy = np.meshgrid(np.arange(min(data[:,0])-1, max(data[:,0])+1, 0.02),
                     np.arange(min(data[:,1])-1, max(data[:,1])+1, 0.02))
zz = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, zz, alpha=0.8)
plt.show()
K-Means weaknesses: 1. sensitive to the initial centroids and can get stuck in a local minimum 2. the user must choose a suitable k 3. clusters only by distance, so it cannot handle density-based structure 4. converges slowly on large datasets
Optimization 1: run several random initializations, compute the cost of each fit, and keep the clustering with the lowest cost
Optimization 2: choose k with the elbow method (plot the cost function against k; see the sketch after this list)
Optimization 3: use DBSCAN
Optimization 4: use Mini Batch K-means
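A sketch of optimization 2, the elbow method, reusing the kmeans.txt data loaded above (KMeans exposes the within-cluster sum of squares as inertia_):
# Plot cost against k; the bend ("elbow") suggests the cluster count (about 4 here).
inertias = [KMeans(k).fit(data).inertia_ for k in range(1, 10)]
plt.plot(range(1, 10), inertias, 'o-')
plt.xlabel('k')
plt.ylabel('inertia (cost)')
plt.show()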
3.1.2 Mini Batch K-means: a variant of K-means that trains on a small random subset of the data in each step, greatly reducing computation time; the result is usually only slightly worse than the standard algorithm, making it a good fit for large datasets
data = np.genfromtxt('kmeans.txt')
kmeansMini = MiniBatchKMeans(4).fit(data)
result = kmeansMini.predict(data)   # predict once, not once per point inside the loop
mark = ['or', 'ob', 'og', 'oy']
for i, d in enumerate(data):
    plt.plot(d[0], d[1], mark[result[i]])
mark = ['*r', '*b', '*g', '*y']
for i, center in enumerate(kmeansMini.cluster_centers_):
    plt.plot(center[0], center[1], mark[i], markersize=20)
plt.show()
Density-based: groups regions of sufficiently high density into clusters and can discover clusters of any shape
Given suitable Epsilon and MinPoints values: if the ε-neighbourhood of a point p contains more than MinPoints points, create a new cluster with p as a core point; add every point that is directly density-reachable or density-reachable from a core point to the corresponding cluster; merge clusters whose core points are density-connected; stop when no new point can be added to any cluster
Weaknesses: large datasets demand substantial memory and I/O; works poorly when cluster densities are uneven or inter-cluster spacing varies widely
Strengths: no need to specify the number of clusters; no constraint on cluster shape; takes noise-filtering parameters (Epsilon and MinPoints)
x1, y1 = datasets.make_circles(2000, factor=0.2, noise=0.1)
x2, y2 = datasets.make_blobs(1000, centers=[[1.2,1.2]], cluster_std=0.1)
x = np.concatenate((x1, x2))
y_kmeans = KMeans(3).fit_predict(x)
y_dbscan = DBSCAN(eps=0.2, min_samples=50).fit_predict(x)
ratio = np.sum(y_dbscan == -1) / len(y_dbscan)   # DBSCAN labels noise points as -1
clusters = len(set(y_dbscan)) - (1 if -1 in y_dbscan else 0)
print('Noise ratio:', format(ratio,'.2%'))
print('Estimated number of clusters: %d'% clusters)
plt.figure(figsize=(14,4))
plt.subplot(121)
plt.title('K-Means')
plt.scatter(x[:,0], x[:,1], c=y_kmeans)
plt.subplot(122)
plt.title('DBSCAN')
plt.scatter(x[:,0], x[:,1], c=y_dbscan)
plt.show()
A dimensionality-reduction algorithm, also useful for visualizing high-dimensional data: it finds the most important directions in the data and projects onto them
Steps: preprocess the data (center it), compute the sample covariance matrix and its eigendecomposition, take the eigenvectors belonging to the k largest eigenvalues, and project the original data onto those eigenvectors (done by hand in the sketch below)
Variance measures spread; covariance measures correlation: near 1 positively correlated, near -1 negatively correlated, near 0 uncorrelated
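The steps above done by hand on the iris data, as a check against sklearn's PCA (component signs may be flipped relative to PCA's output, which is normal):
# Manual PCA: center, covariance, eigendecomposition, project onto the top-k eigenvectors.
X_raw = datasets.load_iris().data
X_centered = X_raw - X_raw.mean(axis=0)            # step 1: center
cov = np.cov(X_centered, rowvar=False)             # step 2: covariance matrix
eigvals, eigvecs = np.linalg.eigh(cov)             # step 3: eigendecomposition (ascending order)
top2 = eigvecs[:, np.argsort(eigvals)[::-1][:2]]   # eigenvectors of the 2 largest eigenvalues
X_proj = X_centered @ top2                         # step 4: project
print('Top-2 eigenvalues (variance explained):', np.sort(eigvals)[::-1][:2])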
iris = datasets.load_iris()
X, y = iris.data, iris.target
print('Data shape:', X.shape)
X_2d = PCA(2).fit_transform(X)
X_3d = PCA(3).fit_transform(X)
fig = plt.figure(figsize=(14,4))
ax = fig.add_subplot(121)
ax.scatter(X_2d[:,0], X_2d[:,1], c=y)
plt.title('4D - 2D')
ax = fig.add_subplot(122, projection='3d')
ax.scatter(X_3d[:,0], X_3d[:,1], X_3d[:,2], c=y)
plt.title('4D - 3D')
plt.show()