Python Data Science Memo

General

  • Import
# Standard imports and display configuration for a data-science session.
import os, sys
import re
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
# NOTE(review): the next two lines are redundant — plt.rcParams is the same
# rcParams object as mpl.rcParams, so setting either one is enough.
mpl.rcParams['figure.figsize'] = (16.0, 8.0)
plt.rcParams['figure.figsize'] = (16.0, 8.0)

import seaborn as sns
sns.set()  # apply seaborn's default theme to all matplotlib figures
#sns.reset_orig()

# Render floats in DataFrame output with thousands separators, 2 decimals.
pd.options.display.float_format = '{:,.2f}'.format

#import scipy as sp
#import statsmodels.api as sm
#import statsmodels.formula.api as smf

numpy

random

np.random.seed(0)  # fix the legacy global RNG so the draws are reproducible

# Each assignment below overwrites x — in practice keep only the one you want.
x = np.random.binomial(30, 0.5, 1000)   # 1000 draws of Binomial(n=30, p=0.5)
x = np.random.poisson(7, 1000)          # 1000 draws of Poisson(lam=7)
x = np.random.normal(5, 10, 10000)      # 10000 draws of Normal(mean=5, sd=10)
#x = np.random.lognormal(30, 0.4, 1000)

plt.hist(x)  # frequency histogram of the last sample generated above
#plt.hist(x, density=True)

#pd.Series(x).plot(kind='kde', style='k--')
#pd.Series(x).hist(density=True)

plt.grid(True)

Pandas

Basic

  • loc, iloc: df.iloc[10, df.columns.get_loc('name')]

groupby

Style

def highlight_max(s):
    """Return one CSS string per element of *s*, yellow where s hits its max.

    Intended for use with DataFrame.style.apply (column- or row-wise).
    Ties are all highlighted; non-max positions get an empty style string.
    """
    peak = s.max()
    return ['background-color: yellow' if value == peak else '' for value in s]

df.style.apply(highlight_max, subset=['B', 'C', 'D'])

Matplotlib

subplots

# using the variable ax for a single Axes
fig, ax = plt.subplots()

# using the variable axs for multiple Axes (here a 2x2 grid of subplots)
fig, axs = plt.subplots(2, 2)

twinx()

# Plot three series on stacked axes; host/par1/par2 are assumed to be a host
# Axes plus twinx() parasite axes created earlier — confirm against caller.
# Higher zorder draws on top; the trailing comma unpacks the single Line2D
# returned by plot() out of its one-element list.
p1, = host.plot([0, 1, 2], [0, 1, 2], label="Density", zorder=2.3)
p2, = par1.plot([0, 1, 2], [0, 3, 2], label="Temperature", zorder=2.2)
p3, = par2.plot([0, 1, 2], [50, 30, 15], label="Velocity", zorder=2.1)

Plot function

def func(x):
    """Evaluate the quadratic x**2 + 2*x + 1 (works element-wise on arrays)."""
    quadratic = x ** 2
    linear = 2 * x
    return quadratic + linear + 1
# Sample the quadratic at 101 evenly spaced points on [-10, 10] and plot it.
x = np.linspace(-10, 10,101)
fig, ax = plt.subplots()
ax.plot(x, func(x))
ax.grid(True)
plt.show()

Misc

scikit-learn

  • Linear regression
from sklearn import linear_model
reg = linear_model.LinearRegression()
# X must be 2-D (n_samples, n_features); selecting with ['A'] (a list) keeps
# the column dimension, unlike df['A'] which would yield a 1-D array.
X = df.loc[:, ['A']].values
Y = df['B'].values
reg.fit(X, Y)
print('Regression coef:', reg.coef_)  # one slope per feature
print('Intercept:', reg.intercept_)

# Scatter the raw points and overlay the fitted line.
plt.scatter(X, Y)
plt.plot(X, reg.predict(X))
plt.grid(True)
  • train_test_split
from sklearn.model_selection import train_test_split
# Hold out 50% of the rows for testing; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
  • Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the mean/std on the training set only, then apply that same transform
# to both sets — avoids leaking test-set statistics into the scaler.
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
  • KMeans
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# make_blobs defaults to 3 centers, matching n_clusters (and colors) below.
X, _ = make_blobs(random_state=10)
kmeans = KMeans(init='random',n_clusters=3)
kmeans.fit(X)
y_pred = kmeans.predict(X)

# Assemble (feature1, feature2, cluster-label) columns into one DataFrame.
merge_data = pd.concat([pd.DataFrame(X[:,0]), pd.DataFrame(X[:,1]), pd.DataFrame(y_pred)], axis=1)
merge_data.columns = ['feature1','feature2','cluster']

# One scatter layer per cluster on a shared Axes: ax is None on the first
# iteration (new figure), then each call draws onto the returned Axes.
ax = None
colors = ['blue', 'red', 'green']
for i, data in merge_data.groupby('cluster'):
    ax = data.plot.scatter(x='feature1', y='feature2', color=colors[i], label=f'cluster{i}', ax=ax)
  • from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV — holdout method, cross-validation, grid search
  • confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc — metrics for classification
  • mean_squared_error, mean_absolute_error, median_absolute_error, r2_score — metrics for regression
  • from sklearn.ensemble import BaggingClassifier, AdaBoostRegressor — bagging and boosting
  • Gradient Boosting, Random Forest
# Compare two ensemble regressors (train/test R^2) on a bundled dataset.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so the original snippet no longer runs. load_diabetes is a drop-in bundled
# regression dataset exposing the same Bunch interface
# (.data, .target, .feature_names).
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

dataset = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, random_state=0)

models = {
  'RandomForest': RandomForestRegressor(random_state=0),
  'GradientBoost': GradientBoostingRegressor(random_state=0)
}
# Fit each model and record its R^2 on both splits, keyed by (name, split).
scores = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    scores[(model_name, 'train_score')] = model.score(X_train, y_train)
    scores[(model_name, 'test_score')] = model.score(X_test, y_test)

# unstack() pivots the (name, split) MultiIndex into a models-by-splits table.
print(pd.Series(scores).unstack())

# Feature importances of the fitted random forest, largest first.
s = pd.Series(models['RandomForest'].feature_importances_, index=dataset.feature_names)
s.sort_values(ascending=False).plot.bar()