House King County

Download the notebook, the raw script, or the annotated script for this tutorial (right-click on the link and save).

Getting started

This tutorial is adapted from the corresponding MLR3 tutorial.

Loading and preparing the data

using MLJ
using PrettyPrinting
import DataFrames: DataFrame, select!, Not, describe
import Statistics
using Dates
using PyPlot
using UrlDownload


# Fetch the raw CSV (the second argument `true` asks UrlDownload to parse it)
# and wrap it in a DataFrame, then summarise every column.
data_url = "https://raw.githubusercontent.com/tlienart/DataScienceTutorialsData.jl/master/data/kc_housing.csv"
df = urldownload(data_url, true) |> DataFrame
describe(df)
21×8 DataFrame
│ Row │ variable      │ mean       │ min             │ median    │ max             │ nunique │ nmissing │ eltype   │
│     │ Symbol        │ Union…     │ Any             │ Union…    │ Any             │ Union…  │ Nothing  │ DataType │
├─────┼───────────────┼────────────┼─────────────────┼───────────┼─────────────────┼─────────┼──────────┼──────────┤
│ 1   │ id            │ 4.5803e9   │ 1000102         │ 3.90493e9 │ 9900000190      │         │          │ Int64    │
│ 2   │ date          │            │ 20140502T000000 │           │ 20150527T000000 │ 372     │          │ String   │
│ 3   │ price         │ 540088.0   │ 75000.0         │ 450000.0  │ 7.7e6           │         │          │ Float64  │
│ 4   │ bedrooms      │ 3.37084    │ 0               │ 3.0       │ 33              │         │          │ Int64    │
│ 5   │ bathrooms     │ 2.11476    │ 0.0             │ 2.25      │ 8.0             │         │          │ Float64  │
│ 6   │ sqft_living   │ 2079.9     │ 290             │ 1910.0    │ 13540           │         │          │ Int64    │
│ 7   │ sqft_lot      │ 15107.0    │ 520             │ 7618.0    │ 1651359         │         │          │ Int64    │
│ 8   │ floors        │ 1.49431    │ 1.0             │ 1.5       │ 3.5             │         │          │ Float64  │
│ 9   │ waterfront    │ 0.00754176 │ 0               │ 0.0       │ 1               │         │          │ Int64    │
│ 10  │ view          │ 0.234303   │ 0               │ 0.0       │ 4               │         │          │ Int64    │
│ 11  │ condition     │ 3.40943    │ 1               │ 3.0       │ 5               │         │          │ Int64    │
│ 12  │ grade         │ 7.65687    │ 1               │ 7.0       │ 13              │         │          │ Int64    │
│ 13  │ sqft_above    │ 1788.39    │ 290             │ 1560.0    │ 9410            │         │          │ Int64    │
│ 14  │ sqft_basement │ 291.509    │ 0               │ 0.0       │ 4820            │         │          │ Int64    │
│ 15  │ yr_built      │ 1971.01    │ 1900            │ 1975.0    │ 2015            │         │          │ Int64    │
│ 16  │ yr_renovated  │ 84.4023    │ 0               │ 0.0       │ 2015            │         │          │ Int64    │
│ 17  │ zipcode       │ 98077.9    │ 98001           │ 98065.0   │ 98199           │         │          │ Int64    │
│ 18  │ lat           │ 47.5601    │ 47.1559         │ 47.5718   │ 47.7776         │         │          │ Float64  │
│ 19  │ long          │ -122.214   │ -122.519        │ -122.23   │ -121.315        │         │          │ Float64  │
│ 20  │ sqft_living15 │ 1986.55    │ 399             │ 1840.0    │ 6210            │         │          │ Int64    │
│ 21  │ sqft_lot15    │ 12768.5    │ 651             │ 7620.0    │ 871200          │         │          │ Int64    │

We drop unrelated columns

# Drop the identifier and sale-date columns, which carry no predictive
# signal here, then inspect the scientific types of what remains.
unused_cols = [:id, :date]
select!(df, Not(unused_cols))
schema(df)
┌───────────────┬─────────┬────────────┐
│ _.names       │ _.types │ _.scitypes │
├───────────────┼─────────┼────────────┤
│ price         │ Float64 │ Continuous │
│ bedrooms      │ Int64   │ Count      │
│ bathrooms     │ Float64 │ Continuous │
│ sqft_living   │ Int64   │ Count      │
│ sqft_lot      │ Int64   │ Count      │
│ floors        │ Float64 │ Continuous │
│ waterfront    │ Int64   │ Count      │
│ view          │ Int64   │ Count      │
│ condition     │ Int64   │ Count      │
│ grade         │ Int64   │ Count      │
│ sqft_above    │ Int64   │ Count      │
│ sqft_basement │ Int64   │ Count      │
│ yr_built      │ Int64   │ Count      │
│ yr_renovated  │ Int64   │ Count      │
│ zipcode       │ Int64   │ Count      │
│ lat           │ Float64 │ Continuous │
│ long          │ Float64 │ Continuous │
│ sqft_living15 │ Int64   │ Count      │
│ sqft_lot15    │ Int64   │ Count      │
└───────────────┴─────────┴────────────┘
_.nrows = 21613

Afterwards, we convert the zip code to an unordered factor (Multiclass), we also create two binary features isrenovated and has_basement derived from yr_renovated and sqft_basement:

# Zip codes are nominal labels, not counts: make them Multiclass.
coerce!(df, :zipcode => Multiclass)
# Binary indicators: a renovation year / basement area of 0 means "none".
df.isrenovated  = @. df.yr_renovated != 0
df.has_basement = @. df.sqft_basement != 0
schema(df)
┌───────────────┬──────────────────────────────────────────────────┬────────────────┐
│ _.names       │ _.types                                          │ _.scitypes     │
├───────────────┼──────────────────────────────────────────────────┼────────────────┤
│ price         │ Float64                                          │ Continuous     │
│ bedrooms      │ Int64                                            │ Count          │
│ bathrooms     │ Float64                                          │ Continuous     │
│ sqft_living   │ Int64                                            │ Count          │
│ sqft_lot      │ Int64                                            │ Count          │
│ floors        │ Float64                                          │ Continuous     │
│ waterfront    │ Int64                                            │ Count          │
│ view          │ Int64                                            │ Count          │
│ condition     │ Int64                                            │ Count          │
│ grade         │ Int64                                            │ Count          │
│ sqft_above    │ Int64                                            │ Count          │
│ sqft_basement │ Int64                                            │ Count          │
│ yr_built      │ Int64                                            │ Count          │
│ yr_renovated  │ Int64                                            │ Count          │
│ zipcode       │ CategoricalArrays.CategoricalValue{Int64,UInt32} │ Multiclass{70} │
│ lat           │ Float64                                          │ Continuous     │
│ long          │ Float64                                          │ Continuous     │
│ sqft_living15 │ Int64                                            │ Count          │
│ sqft_lot15    │ Int64                                            │ Count          │
│ isrenovated   │ Bool                                             │ Count          │
│ has_basement  │ Bool                                             │ Count          │
└───────────────┴──────────────────────────────────────────────────┴────────────────┘
_.nrows = 21613

These created variables should be treated as OrderedFactor,

coerce!(df, :isrenovated => OrderedFactor, :has_basement => OrderedFactor);

The feature waterfront is currently encoded as an integer (0/1), but it's really just a boolean:

unique(df.waterfront)
2-element Array{Int64,1}:
 0
 1

So let's recode it

# `waterfront` holds the integers 0/1 (see `unique` above), so recode it as
# a boolean by comparing against 0.  The previous comparison `.!= "FALSE"`
# matched a string against integers, which is `true` for every row and
# collapsed the column to a single level (the OrderedFactor{1} seen later).
df.waterfront = (df.waterfront .!= 0)
coerce!(df, :waterfront => OrderedFactor);

For a number of the remaining features which are treated as Count there are few unique values in which case it might make more sense to recode them as OrderedFactor, this can be done with autotype:

# Ask `autotype` for columns with few unique values and promote them from
# Count to OrderedFactor in one go.
suggestions = autotype(df, :few_to_finite)
coerce!(df, suggestions)
schema(df)
┌───────────────┬────────────────────────────────────────────────────┬───────────────────┐
│ _.names       │ _.types                                            │ _.scitypes        │
├───────────────┼────────────────────────────────────────────────────┼───────────────────┤
│ price         │ Float64                                            │ Continuous        │
│ bedrooms      │ CategoricalArrays.CategoricalValue{Int64,UInt32}   │ OrderedFactor{13} │
│ bathrooms     │ CategoricalArrays.CategoricalValue{Float64,UInt32} │ OrderedFactor{30} │
│ sqft_living   │ Int64                                              │ Count             │
│ sqft_lot      │ Int64                                              │ Count             │
│ floors        │ CategoricalArrays.CategoricalValue{Float64,UInt32} │ OrderedFactor{6}  │
│ waterfront    │ CategoricalArrays.CategoricalValue{Bool,UInt32}    │ OrderedFactor{1}  │
│ view          │ CategoricalArrays.CategoricalValue{Int64,UInt32}   │ OrderedFactor{5}  │
│ condition     │ CategoricalArrays.CategoricalValue{Int64,UInt32}   │ OrderedFactor{5}  │
│ grade         │ CategoricalArrays.CategoricalValue{Int64,UInt32}   │ OrderedFactor{12} │
│ sqft_above    │ Int64                                              │ Count             │
│ sqft_basement │ Int64                                              │ Count             │
│ yr_built      │ Int64                                              │ Count             │
│ yr_renovated  │ CategoricalArrays.CategoricalValue{Int64,UInt32}   │ OrderedFactor{70} │
│ zipcode       │ CategoricalArrays.CategoricalValue{Int64,UInt32}   │ Multiclass{70}    │
│ lat           │ Float64                                            │ Continuous        │
│ long          │ Float64                                            │ Continuous        │
│ sqft_living15 │ Int64                                              │ Count             │
│ sqft_lot15    │ Int64                                              │ Count             │
│ isrenovated   │ CategoricalArrays.CategoricalValue{Bool,UInt32}    │ OrderedFactor{2}  │
│ has_basement  │ CategoricalArrays.CategoricalValue{Bool,UInt32}    │ OrderedFactor{2}  │
└───────────────┴────────────────────────────────────────────────────┴───────────────────┘
_.nrows = 21613

Let's also rescale the column price to be in 1000s of dollars:

df.price = df.price ./ 1000;

For simplicity let's just drop a few additional columns that don't seem to matter much:

select!(df, Not([:yr_renovated, :sqft_basement, :zipcode]));

Basic data visualisation

Let's plot a basic histogram of the prices to get an idea for the distribution:

# Overall distribution of sale prices (density-normalised histogram).
plt.figure(figsize=(8, 6))
plt.hist(df.price, bins=50, density=true, alpha=0.5,
         color="blue", edgecolor="white")
plt.xlabel("Price", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
Histogram of the prices

Let's see if there's a difference between renovated and unrenovated flats:

# Compare the price distribution of renovated vs unrenovated properties.
# (The two hist calls are kept in this order so the overlay layering is
# unchanged.)
renovated = df.isrenovated .== true
plt.figure(figsize=(8, 6))
plt.hist(df.price[renovated], bins=50, density=true, alpha=0.5,
         color="blue", edgecolor="white", label="renovated")
plt.hist(df.price[.!renovated], bins=50, density=true, alpha=0.5,
         color="red", edgecolor="white", label="unrenovated")
plt.xlabel("Price", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.legend(fontsize=12)
Histogram of the prices depending on renovation We can observe that renovated flats seem to achieve higher sales values, and this might thus be a relevant feature.

Likewise, this could be done to verify that condition, waterfront etc are important features.

Fitting a first model

# Load the model code.  `pkg=DecisionTree` disambiguates the load, matching
# the explicit `pkg=...` style used for the other model loads in this
# script (more than one registered package provides a DecisionTreeRegressor).
@load DecisionTreeRegressor pkg=DecisionTree

# Split the target (`price`) from the features, then hold out 30% of the
# rows for testing; `rng=5` makes the shuffle reproducible.
y, X = unpack(df, ==(:price), col -> true)
train, test = partition(eachindex(y), 0.7, shuffle=true, rng=5)

# Bind a default decision tree to the data and train on the training rows.
tree = machine(DecisionTreeRegressor(), X, y)

fit!(tree, rows=train);

Let's see how it does

rms(y[test], predict(tree, rows=test))
179.7653629352114

Let's try to do better.

Random forest model

We might be able to improve upon the RMSE using more powerful learners.

@load RandomForestRegressor pkg=ScikitLearn
RandomForestRegressor(
    n_estimators = 100,
    criterion = "mse",
    max_depth = nothing,
    min_samples_split = 2,
    min_samples_leaf = 1,
    min_weight_fraction_leaf = 0.0,
    max_features = "auto",
    max_leaf_nodes = nothing,
    min_impurity_decrease = 0.0,
    bootstrap = true,
    oob_score = false,
    n_jobs = nothing,
    random_state = nothing,
    verbose = 0,
    warm_start = false) @062

That model only accepts input in the form of Count and so we have to coerce all Finite types into Count:

coerce!(X, Finite => Count);

Now we can fit

# Train a random forest on the same train/test split and score it.
rf_mdl  = RandomForestRegressor()
rf_mach = machine(rf_mdl, X, y)
fit!(rf_mach, rows=train)

rms(y[test], predict(rf_mach, rows=test))
138.15321939385157

A bit better but it would be best to check this a bit more carefully:

# Cross-validate rather than trusting a single split.  `CV()` defaults to
# 6 folds (matching the 6 per-fold scores reported below); `shuffle=true`
# randomises fold assignment.  (A previously-defined `cv3 = CV(nfolds=3)`
# was never used and has been removed.)
res = evaluate(rf_mdl, X, y, resampling=CV(shuffle=true),
               measure=rms, verbosity=0)
┌───────────┬───────────────┬────────────────────────────────────────────┐
│ _.measure │ _.measurement │ _.per_fold                                 │
├───────────┼───────────────┼────────────────────────────────────────────┤
│ rms       │ 134.0         │ [121.0, 139.0, 131.0, 134.0, 131.0, 146.0] │
└───────────┴───────────────┴────────────────────────────────────────────┘
_.per_observation = [missing]

GBM

Let's try a different kind of model: Gradient Boosted Decision Trees from the package xgboost and we'll try to tune it too.

@load XGBoostRegressor
XGBoostRegressor(
    num_round = 100,
    booster = "gbtree",
    disable_default_eval_metric = 0,
    eta = 0.3,
    gamma = 0.0,
    max_depth = 6,
    min_child_weight = 1.0,
    max_delta_step = 0.0,
    subsample = 1.0,
    colsample_bytree = 1.0,
    colsample_bylevel = 1.0,
    lambda = 1.0,
    alpha = 0.0,
    tree_method = "auto",
    sketch_eps = 0.03,
    scale_pos_weight = 1.0,
    updater = "auto",
    refresh_leaf = 1,
    process_type = "default",
    grow_policy = "depthwise",
    max_leaves = 0,
    max_bin = 256,
    predictor = "cpu_predictor",
    sample_type = "uniform",
    normalize_type = "tree",
    rate_drop = 0.0,
    one_drop = 0,
    skip_drop = 0.0,
    feature_selector = "cyclic",
    top_k = 0,
    tweedie_variance_power = 1.5,
    objective = "reg:linear",
    base_score = 0.5,
    eval_metric = "rmse",
    seed = 0) @435

It expects a Table(Continuous) input so we need to coerce X again:

# XGBoost wants continuous features, so promote the Count columns.
coerce!(X, Count => Continuous)

# Fit a boosted-tree model with default hyperparameters and score it on
# the held-out rows.
xgb      = XGBoostRegressor()
xgb_mach = machine(xgb, X, y)
fit!(xgb_mach, rows=train)

rms(y[test], predict(xgb_mach, rows=test))
137.35137881292272

Let's try to tune it, first we define ranges for a number of useful parameters:

# Hyperparameter ranges to search over: tree depth and boosting rounds.
r1, r2 = range(xgb, :max_depth, lower=3, upper=10),
         range(xgb, :num_round, lower=1, upper=25);

And now we tune; we use a fairly coarse resolution because the grid grows quickly with the number of ranges: with 2 ranges at resolution 7 there are already 7^2 = 49 candidate models...

# Wrap the model in a grid-search tuner (7 points per range), scoring each
# candidate by cross-validated RMS, then fit on the training rows and
# evaluate the best model on the test rows.
tuned = TunedModel(model=xgb, ranges=[r1, r2], measure=rms,
                   tuning=Grid(resolution=7), resampling=CV(rng=11))
tuned_mach = machine(tuned, X, y)
fit!(tuned_mach, rows=train)

rms(y[test], predict(tuned_mach, rows=test))
140.91453933000128

In this case tuning did not actually improve on the default model (140.9 vs 137.4 RMSE); a finer grid or wider ranges might do better.