## Hyperparameters

- `n_estimators` - the number of trees in the forest. Use fewer estimators when you are working with a large dataset: the model builds faster, so you can iterate quickly while learning about the data.
- `min_samples_leaf` - the minimum number of samples required at a leaf node of each tree; the default is 1. You can increase it to 3, 5, 10, 25, etc. to get shallower trees with fewer splits near the leaves. Especially good for regression models with a lot of data.
- `max_features` - the fraction of columns considered at each split; with `max_features=0.5`, only half the columns are candidates at every split. The default is to use all of them. Especially useful when one column dominates and every tree would otherwise pick it, leaving little variety among the trees.
- `n_jobs` - not a hyperparameter; it is the number of CPU cores to use for training. `-1` uses all of them.

These hyperparameters come together in the random forest sketch at the end of this section.

Convert strings to categories:

```python
from pandas.api.types import is_string_dtype


def train_cats(df):
    """Change any columns of strings in a pandas dataframe to a column of
    categorical values. This applies the changes inplace.
    """
    for n, c in df.items():
        if is_string_dtype(c):
            df[n] = c.astype('category').cat.as_ordered()


def apply_cats(df, trn):
    """Changes any columns of strings in df into categorical variables using
    trn as a template for the category codes.
    """
    for n, c in df.items():
        if (n in trn.columns) and (trn[n].dtype.name == 'category'):
            df[n] = c.astype('category').cat.as_ordered()
            # Reassign rather than pass inplace=True, which recent pandas removed
            df[n] = df[n].cat.set_categories(trn[n].cat.categories, ordered=True)
```

Percentage of null values per column:

```python
# display_all is a notebook helper that shows the full dataframe without truncation
display_all(df_raw.isnull().sum().sort_index() / len(df_raw))
```

Save the dataframe in feather format so it is fast to save and reload.
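As a minimal sketch of the feather round trip, assuming `df_raw` is the dataframe from the snippets above (the `tmp/` directory and file name are arbitrary choices here):

```python
import os

import pandas as pd

os.makedirs('tmp', exist_ok=True)

df_raw.to_feather('tmp/df-raw.feather')         # fast binary save (requires pyarrow)
df_raw = pd.read_feather('tmp/df-raw.feather')  # fast reload
```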
The full preprocessing module these helpers come from:

```python
# AUTOGENERATED! DO NOT EDIT! File to edit: notebooks/01_structured.ipynb (unless otherwise specified).

__all__ = ['draw_tree', 'get_sample', 'add_datepart', 'is_date', 'train_cats', 'apply_cats',
           'fix_missing', 'numericalize', 'scale_vars', 'proc_df', 'rf_feat_importance']

# Cell
# Copyright 2017 Jeremy Howard
# Copyright 2020 Lewis Tunstall
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file has been modified by Lewis Tunstall to be included in an nbdev development environment.
# In particular, the docstrings have been trimmed to match the form expected in the Jekyll documentation.
#
# The following functions have been removed from the original file:
#
# - set_plot_sizes
# - parallel_trees
# - combine_date
# - set_rf_samples
# - reset_rf_samples
# - get_nn_mappers
#
# Docstrings have been added to the following functions:
#
# - scale_vars
# - rf_feat_importance

# Cell
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from sklearn.tree import export_graphviz
import IPython, graphviz
import re
from nbdev.showdoc import *
import pandas as pd
import numpy as np          # used by get_sample, add_datepart and is_date
import warnings             # used by scale_vars
import sklearn.exceptions   # used by scale_vars

# Cell
def draw_tree(t, df, size=10, ratio=0.6, precision=0):
    """Draws a representation of a random forest in IPython."""
    s = export_graphviz(t, out_file=None, feature_names=df.columns, filled=True,
                        special_characters=True, rotate=True, precision=precision)
    IPython.display.display(graphviz.Source(re.sub('Tree {',
                                                   f'Tree {{ size={size}; ratio={ratio}', s)))

# Cell
def get_sample(df, n):
    """Gets a random sample of n rows from df, without replacement."""
    idxs = sorted(np.random.permutation(len(df))[:n])
    return df.iloc[idxs].copy()

# Cell
def add_datepart(df, fldnames, drop=True, time=False, errors="raise"):
    """add_datepart converts a column of df from a datetime64 to many columns
    containing the information from the date. This applies changes inplace."""
    if isinstance(fldnames, str):
        fldnames = [fldnames]
    for fldname in fldnames:
        fld = df[fldname]
        fld_dtype = fld.dtype
        if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
            fld_dtype = np.datetime64
        if not np.issubdtype(fld_dtype, np.datetime64):
            df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True, errors=errors)
        targ_pre = re.sub('[Dd]ate$', '', fldname)
        attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
                'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
                'Is_year_end', 'Is_year_start']
        if time:
            attr = attr + ['Hour', 'Minute', 'Second']
        for n in attr:
            df[targ_pre + n] = getattr(fld.dt, n.lower())
        df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
        if drop:
            df.drop(fldname, axis=1, inplace=True)

def is_date(x):
    return np.issubdtype(x.dtype, np.datetime64)

# Cell
def train_cats(df):
    """Change any columns of strings in a pandas dataframe to a column of
    categorical values. This applies the changes inplace.
    """
    for n, c in df.items():
        if is_string_dtype(c):
            df[n] = c.astype('category').cat.as_ordered()

# Cell
def apply_cats(df, trn):
    """Changes any columns of strings in df into categorical variables using
    trn as a template for the category codes.
    """
    for n, c in df.items():
        if (n in trn.columns) and (trn[n].dtype.name == 'category'):
            df[n] = c.astype('category').cat.as_ordered()
            # Reassign rather than pass inplace=True, which recent pandas removed
            df[n] = df[n].cat.set_categories(trn[n].cat.categories, ordered=True)

# Cell
def fix_missing(df, col, name, na_dict):
    """Fill missing data in a column of df with the median, and add a
    {name}_na column which specifies if the data was missing.
    """
    if is_numeric_dtype(col):
        if pd.isnull(col).sum() or (name in na_dict):
            df[name + '_na'] = pd.isnull(col)
            filler = na_dict[name] if name in na_dict else col.median()
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict
""" if not is_numeric_dtype(col) and ( max_n_cat is None or len(col.cat.categories)>max_n_cat): df[name] = pd.Categorical(col).codes+1 # Cell def scale_vars(df, mapper): """ Standardize numerical features by removing the mean and scaling to unit variance. """ warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning) if mapper is None: map_f = [([n],StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])] mapper = DataFrameMapper(map_f).fit(df) df[mapper.transformed_names_] = mapper.transform(df) return mapper # Cell def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None, preproc_fn=None, max_n_cat=None, subset=None, mapper=None): """ proc_df takes a data frame df and splits off the response variable, and changes the df into an entirely numeric dataframe. For each column of df which is not in skip_flds nor in ignore_flds, na values are replaced by the median value of the column. """ if not ignore_flds: ignore_flds=[] if not skip_flds: skip_flds=[] if subset: df = get_sample(df,subset) else: df = df.copy() ignored_flds = df.loc[:, ignore_flds] df.drop(ignore_flds, axis=1, inplace=True) if preproc_fn: preproc_fn(df) if y_fld is None: y = None else: if not is_numeric_dtype(df[y_fld]): df[y_fld] = pd.Categorical(df[y_fld]).codes y = df[y_fld].values skip_flds += [y_fld] df.drop(skip_flds, axis=1, inplace=True) if na_dict is None: na_dict = {} else: na_dict = na_dict.copy() na_dict_initial = na_dict.copy() for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict) if len(na_dict_initial.keys()) > 0: df.drop([a + '_na' for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))], axis=1, inplace=True) if do_scale: mapper = scale_vars(df, mapper) for n,c in df.items(): numericalize(df, c, n, max_n_cat) df = pd.get_dummies(df, dummy_na=True) df = pd.concat([ignored_flds, df], axis=1) res = [df, y, na_dict] if do_scale: res = res + [mapper] return res # Cell def rf_feat_importance(m, df): """ Create a pandas.DataFrame of feature importances. """ return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_} ).sort_values('imp', ascending=False) ```