## Hyper Parameters
`n_estimators` - the number of trees in the forest. If you are working with a large dataset, using fewer estimators lets you build a model faster and iterate quickly to learn about the data.
`min_samples_leaf` - it's the minimum number of samples required at a leaf node in every tree. The default is 1. You can increase this number to 3, 5, 10, 25, etc. to have fewer layers at the bottom of each tree. Especially good for regression models with a lot of data.
`max_features` - if max_features is 0.5 only half the columns are used at every split. Default is to use all of them. It is especially useful if one column is dominating and all trees pick that. So there is less variety in the trees.
`n_jobs` - not a hyperparameter; it's the number of CPU cores to use for training. `-1` uses all available cores.
Convert strings to categories
```python
def train_cats(df):
    """Change any columns of strings in a panda's dataframe to a column of
    categorical values. This applies the changes inplace.
    """
    # Walk every column; only string-typed ones are converted. The
    # ordered flag lets downstream code treat the codes as comparable.
    for col_name, col in df.items():
        if not is_string_dtype(col):
            continue
        df[col_name] = col.astype('category').cat.as_ordered()
def apply_cats(df, trn):
    """Changes any columns of strings in df into categorical variables using trn as
    a template for the category codes.

    Parameters:
    -----------
    df: dataframe to transform; modified inplace.
    trn: training dataframe whose categorical columns supply the
        category -> code mapping, so train/test encodings agree.
    """
    for n, c in df.items():
        if (n in trn.columns) and (trn[n].dtype.name == 'category'):
            df[n] = c.astype('category').cat.as_ordered()
            # Reassign instead of passing ``inplace=True``: that keyword
            # was deprecated and removed in pandas 2.0, where it raises
            # a TypeError. Values absent from trn's categories become NaN.
            df[n] = df[n].cat.set_categories(trn[n].cat.categories, ordered=True)
```
Percentage of null values per column
```python
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))
```
Save the dataframe in Feather format so it can be written and reloaded quickly:
```python
# AUTOGENERATED! DO NOT EDIT! File to edit: notebooks/01_structured.ipynb (unless otherwise specified).
# Public API of this nbdev-generated module; the names listed here must
# match the definitions exported from the source notebook below.
__all__ = ['draw_tree', 'get_sample', 'add_datepart', 'is_date', 'train_cats', 'apply_cats', 'fix_missing',
           'numericalize', 'scale_vars', 'proc_df', 'rf_feat_importance']
# Cell
# Copyright 2017 Jeremy Howard
# Copyright 2020 Lewis Tunstall
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file has been modified by Lewis Tunstall to be included in an nbdev development environment.
# In particular, the docstrings have been trimmed to match the form expected in the Jekyll documentation.
# The following functions have been removed from the original file:
#
# - set_plot_sizes
# - parallel_trees
# - combine_date
# - set_rf_samples
# - reset_rf_samples
# - get_nn_mappers
#
# Docstrings have been added to the following functions:
#
# - scale_vars
# - rf_feat_importance
# Cell
import re

import graphviz
import IPython
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import export_graphviz
from sklearn_pandas import DataFrameMapper

from nbdev.showdoc import *
# Cell
def draw_tree(t, df, size=10, ratio=0.6, precision=0):
    """Draws a representation of a random forest in IPython."""
    # Export the fitted tree to Graphviz DOT source, then inject the
    # size/ratio attributes into the graph header before rendering.
    dot_src = export_graphviz(
        t, out_file=None, feature_names=df.columns, filled=True,
        special_characters=True, rotate=True, precision=precision)
    sized_src = re.sub('Tree {', f'Tree {{ size={size}; ratio={ratio}', dot_src)
    IPython.display.display(graphviz.Source(sized_src))
# Cell
def get_sample(df, n):
    """Gets a random sample of n rows from df, without replacement.

    Parameters:
    -----------
    df: source dataframe.
    n: number of rows to sample; must be <= len(df).

    Returns:
    --------
    A new dataframe (copy) containing n rows, kept in their original
    relative order.

    Note: requires numpy imported as ``np`` at module level — the
    original file used ``np`` without importing numpy, which raised a
    NameError at call time.
    """
    # Permute all row positions, keep the first n, then sort so the
    # sampled rows preserve the source ordering.
    idxs = sorted(np.random.permutation(len(df))[:n])
    return df.iloc[idxs].copy()
# Cell
def add_datepart(df, fldnames, drop=True, time=False, errors="raise"):
"""add_datepart converts a column of df from a datetime64 to many columns containing
the information from the date. This applies changes inplace."""
if isinstance(fldnames,str):
fldnames = [fldnames]
for fldname in fldnames:
fld = df[fldname]
fld_dtype = fld.dtype
if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
fld_dtype = np.datetime64
if not np.issubdtype(fld_dtype, np.datetime64):
df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True, errors=errors)
targ_pre = re.sub('[Dd]ate