Requirements

  • numpy
  • pandas
  • sklearn
  • scipy
from __future__ import print_function

import os
import subprocess

import pandas as pd
import numpy as np

from time import time
from operator import itemgetter
from scipy.stats import randint

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction import DictVectorizer

Data

We use the iris data set as an example.


def get_iris_data():
    """Get the iris data, from local csv or pandas repo.

    Reads ``iris.csv`` from the working directory when present;
    otherwise downloads it from the pandas GitHub repo and caches a
    local copy for subsequent runs.

    Returns
    -------
    df -- pandas DataFrame with the iris data.
    """
    if os.path.exists("iris.csv"):
        print("-- iris.csv found locally")
        df = pd.read_csv("iris.csv", index_col=0)
    else:
        print("-- trying to download from github")
        fn = "https://raw.githubusercontent.com/pydata/pandas/" + \
             "master/pandas/tests/data/iris.csv"
        try:
            df = pd.read_csv(fn)
        except Exception as exc:
            # Was a bare `except:`, which also swallowed SystemExit /
            # KeyboardInterrupt and hid the failure reason.
            exit("-- Unable to download iris.csv: {}".format(exc))

        with open("iris.csv", 'w') as f:
            print("-- writing to local iris.csv file")
            df.to_csv(f)

    return df

Encode_target

Because a Decision Tree only accepts numeric inputs, we need to translate the non-numeric (categorical) target into integers. In other words, we use an integer to represent each class.

In this case: {'Iris-virginica': 2, 'Iris-setosa': 0, 'Iris-versicolor': 1}


def encode_target(df, target_column):
    """Add column to df with integers for the target.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
                     new Target column.

    Returns
    -------
    df_mod -- modified DataFrame (copy; the input is untouched).
    targets -- list of target names, in order of first appearance.
    """
    result = df.copy()
    class_names = result[target_column].unique()
    # Enumerate classes in first-appearance order: name -> integer code.
    encoding = dict((label, code) for code, label in enumerate(class_names))
    print(encoding)
    result["Target"] = result[target_column].replace(encoding)

    return (result, class_names)

Deal with non numeric features

The classifier (DT) only deals with numeric features, so we need to translate our features into that format using DictVectorizer.


def translate_non_numeric_features(df):
    """One-hot encode non numeric columns so the DT can consume them.

    The classifier (DT) only deals with numeric features, so we
    translate categorical columns into numeric indicator columns
    using DictVectorizer.

    Args
    ----
    df -- data frame with (possibly) non numeric features

    Returns
    -------
    new_df -- DataFrame with strictly numeric columns named by
              DictVectorizer (e.g. ``col=value`` for categoricals).
    """
    # to_dict('records') keeps rows in order and preserves dtypes;
    # the previous df.T.to_dict().values() round-trip relied on dict
    # ordering and coerced mixed columns to object via the transpose.
    records = df.to_dict('records')
    vec = DictVectorizer()
    matrix = vec.fit_transform(records).toarray()
    new_df = pd.DataFrame(matrix,
                          columns=vec.get_feature_names())

    return new_df

Random Parameter Search

Try to find the best parameters for Decision Tree.


def run_randomsearch(X, y, clf, param_dist, cv=5,
                     n_iter_search=20):
    """Run a random search for best Decision Tree parameters.

    Args
    ----
    X -- features
    y -- targets (classes)
    clf -- scikit-learn Decision Tree
    param_dist -- [dict] list, distributions of parameters
                  to sample
    cv -- fold of cross-validation, default 5
    n_iter_search -- number of random parameter sets to try,
                     default 20.
    Returns
    -------
    top_params -- [dict] from report()
    """
    # Fixes: the parameter used to be named `para_dist` while the body
    # referenced `param_dist` (it only worked via a same-named global in
    # __main__), and `cv` was accepted but never forwarded to the search.
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       cv=cv,
                                       n_iter=n_iter_search)

    start = time()
    random_search.fit(X, y)
    print(("\nRandomizedSearchCV took {:.2f} seconds "
           "for {:d} candidates parameter "
           "settings.").format((time() - start),
                               n_iter_search))

    top_params = report(random_search.grid_scores_, 3)
    return top_params


def report(grid_scores, n_top=3):
    """Report top n_top parameters settings, default n_top=3.

    Args
    ----
    grid_scores -- output from grid or random search
    n_top -- how many to report, of top models

    Returns
    -------
    top_params -- [dict] top parameter settings found in
                  search
    """
    # Rank by mean validation score (field 1 of each grid-score tuple),
    # best first, and keep only the n_top entries.
    ranked = sorted(grid_scores, key=itemgetter(1), reverse=True)
    best = ranked[:n_top]

    for rank, entry in enumerate(best, start=1):
        print("Model with rank: {0}".format(rank))
        print(("Mean validation score: "
               "{0:.3f} (std: {1:.3f})").format(
            entry.mean_validation_score,
            np.std(entry.cv_validation_scores)))
        print("Parameters: {0}".format(entry.parameters))
        print("")

    return best[0].parameters

Visualize Tree

We built the Decision Tree model; one benefit is that we can visualize the model as a tree.

def visualize_tree(tree, feature_names, fn="dt"):
    """Create tree png using graphviz.

    Writes `fn`.dot via export_graphviz, then shells out to `dot`
    to render `fn`.png.

    Args
    ----
    tree -- scikit-learn Decision Tree.
    feature_names -- list of feature names.
    fn -- [string], root of filename, default `dt`.
    """
    dotfile = fn + ".dot"
    pngfile = fn + ".png"

    with open(dotfile, 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    # List form (shell=False) avoids shell injection; catch only what
    # `dot` can actually raise -- a missing binary (OSError) or a
    # non-zero exit status -- instead of a bare `except:`.
    command = ["dot", "-Tpng", dotfile, "-o", pngfile]
    try:
        subprocess.check_call(command)
    except (OSError, subprocess.CalledProcessError):
        exit("Could not run dot, ie graphviz, "
             "to produce visualization")

Predict

# predict class (note the 2-D input: one row per sample)
clf.predict([[5, 1.2, 4.3, 2.2]])
# predict class with probability
clf.predict_proba([[5.2, 1.2, 3.3, 1.2]])

Main

if __name__ == "__main__":

    df = get_iris_data()
    print("* df.head()", df.head(), sep="\n", end="\n\n")
    print("* df.tail()", df.tail(), sep="\n", end="\n\n")
    print("* iris types:", df["Name"].unique(), sep="\n")

    df2, targets = encode_target(df, "Name")
    print("\n")
    print("* df2.head()", df2[["Target", "Name"]].head(), sep="\n", end="\n\n")
    print("* df2.tail()", df2[["Target", "Name"]].tail(),
          sep="\n", end="\n\n")
    print("* targets", targets, sep="\n", end="\n\n")

    # First four columns are the numeric iris measurements.
    features = list(df2.columns[:4])
    print("* features:", features, sep="\n")

    y = df2["Target"]
    X = df2[features]

    # Baseline tree with hand-picked parameters, scored by 10-fold CV.
    dt_old = DecisionTreeClassifier(min_samples_split=20, random_state=99)
    dt_old.fit(X, y)
    scores = cross_val_score(dt_old, X, y, cv=10)
    print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                              scores.std()),
          end="\n\n")
    # visualize_tree(dt_old, features)

    print("-- Random Parameter Search via 10-fold CV")

    # dict of parameter list/distributions to sample.
    # min_samples_split must be >= 2: randint(1, 20) could sample 1,
    # which scikit-learn rejects.
    param_dist = {"criterion": ["gini", "entropy"],
                  "min_samples_split": randint(2, 20),
                  "max_depth": randint(5, 10),
                  "min_samples_leaf": randint(1, 20),
                  "max_leaf_nodes": randint(10, 20)}
    dt = DecisionTreeClassifier()
    ts_rs = run_randomsearch(X, y, dt, param_dist, cv=10,
                             n_iter_search=288)

    print("\nts_rs\n")
    print(ts_rs)

    print("\n-- Best Parameters:")
    for k, v in ts_rs.items():
        print("parameters: {:<20s} setting: {}".format(k, v))

    # test the returned best parameters
    print("\n\n-- Testing best parameters [Random]...")
    dt_ts_rs = DecisionTreeClassifier(**ts_rs)
    scores = cross_val_score(dt_ts_rs, X, y, cv=10)
    print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                              scores.std()),
          end="\n\n")

    dt_ts_rs.fit(X, y)
    visualize_tree(dt_ts_rs, features, fn="rand_best")

    # Estimators expect a 2-D array of shape (n_samples, n_features),
    # so the single sample is wrapped in an outer list.
    print(dt_ts_rs.predict([[5, 1.2, 4.3, 2.2]]))
    print(dt_ts_rs.predict_proba([[5, 1.2, 4.3, 2.2]]))

Reference