General
# Standard imports and global display settings for analysis work.
import os
import re
import sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Default figure size for every plot. plt.rcParams is the same object as
# mpl.rcParams, so setting it once is sufficient (the original set it twice).
mpl.rcParams['figure.figsize'] = (16.0, 8.0)

# Apply seaborn's default theme on top of matplotlib's defaults.
sns.set()

# Display floats with thousands separators and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format
Matplotlib
subplots
# One figure with a single Axes.
fig, ax = plt.subplots()
# One figure with a 2x2 grid of Axes (axs is a 2-D array of Axes objects).
fig, axs = plt.subplots(2, 2)
twinx()
# Plot three series that share an x-axis but use separate y-axes.
# NOTE(review): assumes `host` is an Axes and `par1`/`par2` were created via
# host.twinx() — the setup is not shown here; confirm before reuse.
# Higher zorder draws on top, so p1 (Density) is drawn over the others.
p1, = host.plot([0, 1, 2], [0, 1, 2], label="Density", zorder=2.3)
p2, = par1.plot([0, 1, 2], [0, 3, 2], label="Temperature", zorder=2.2)
p3, = par2.plot([0, 1, 2], [50, 30, 15], label="Velocity", zorder=2.1)
Plot function
def func(x):
    """Return ``x**2 + 2*x + 1``, i.e. ``(x + 1)**2``.

    Works on scalars and on NumPy arrays (element-wise).
    """
    # Body was unindented in the original (broken paste); restored here.
    return x ** 2 + 2 * x + 1
# Evaluate func on 101 evenly spaced samples of [-10, 10] and plot the curve.
xs = np.linspace(-10, 10, 101)
fig, ax = plt.subplots()
ax.plot(xs, func(xs))
ax.grid(True)
plt.show()
scikit-learn
# Ordinary least squares: regress column 'B' on column 'A'.
# NOTE(review): assumes a DataFrame `df` with numeric columns 'A' and 'B'
# exists in the session — it is not defined in this file; verify.
from sklearn import linear_model
reg = linear_model.LinearRegression()
# .loc[:, ['A']] keeps X 2-D with shape (n_samples, 1), as sklearn requires.
X = df.loc[:, ['A']].values
Y = df['B'].values
reg.fit(X, Y)
print('Regression coef:', reg.coef_)
print('Intercept:', reg.intercept_)
# Scatter the raw data and overlay the fitted line.
plt.scatter(X, Y)
plt.plot(X, reg.predict(X))
plt.grid(True)
# Holdout split: reserve 50% of the data for testing; random_state=0 makes
# the shuffle reproducible.
from sklearn.model_selection import train_test_split
# NOTE(review): lowercase `y` is not defined earlier in this file (only `Y`)
# — confirm which target variable was intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
# Standardize features to zero mean and unit variance.
# The scaler is fitted on the training split only, then the same
# transformation is applied to both splits (avoids test-set leakage).
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
# K-means clustering on synthetic blob data, plotted one color per cluster.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# make_blobs defaults to 3 centers; random_state makes the data reproducible.
X, _ = make_blobs(random_state=10)

# Fit 3 clusters using plain random centroid initialization
# (instead of the default 'k-means++').
kmeans = KMeans(init='random', n_clusters=3)
kmeans.fit(X)
y_pred = kmeans.predict(X)

# Gather features and predicted labels into one DataFrame.
# Equivalent to the original three-way pd.concat of single-column frames
# followed by a column rename, but built in one clear step.
merge_data = pd.DataFrame({
    'feature1': X[:, 0],
    'feature2': X[:, 1],
    'cluster': y_pred,
})

# Draw one scatter per cluster onto a shared Axes: ax is None on the first
# iteration (a new Axes is created) and reused afterwards.
ax = None
colors = ['blue', 'red', 'green']
for i, data in merge_data.groupby('cluster'):
    ax = data.plot.scatter(x='feature1', y='feature2', color=colors[i], label=f'cluster{i}', ax=ax)
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
Model evaluation strategies: holdout method, cross-validation, grid search
Classification metrics:
confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
Regression metrics:
mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from sklearn.ensemble import BaggingClassifier, AdaBoostRegressor
Ensemble methods: Bagging (e.g. Random Forest) and Boosting (e.g. AdaBoost, Gradient Boosting)
# Compare Random Forest and Gradient Boosting regressors on one split.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# fetch_california_housing is the documented regression replacement.
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, random_state=0)

models = {
    'RandomForest': RandomForestRegressor(random_state=0),
    'GradientBoost': GradientBoostingRegressor(random_state=0),
}

# Score (R^2) each model on both splits; a large train/test gap signals
# overfitting.
scores = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    scores[(model_name, 'train_score')] = model.score(X_train, y_train)
    scores[(model_name, 'test_score')] = model.score(X_test, y_test)
print(pd.Series(scores).unstack())

# Rank features by the fitted Random Forest's impurity-based importances.
s = pd.Series(models['RandomForest'].feature_importances_, index=housing.feature_names)
s.sort_values(ascending=False).plot.bar()