Preprocess data for analysis and visualization.
Usage
preprocess(x, config, ...)
preprocess.class_tabular.PreprocessorConfig(
x,
config,
dat_validation = NULL,
dat_test = NULL,
verbosity = 1L
)
preprocess.class_tabular.Preprocessor(x, config, verbosity = 1L)
Arguments
- x
data.frame, data.table, tbl_df (tabular data): Data to be preprocessed.
- config
PreprocessorConfig: Setup using setup_Preprocessor OR Preprocessor object: Output of a previous run of preprocess. This allows, for example, applying preprocessing to a validation or test set using the same parameters as were used for the training set. In particular, the same scale centers and coefficients will be applied to the new data.
- ...
Not used.
- dat_validation
tabular data: Validation set data.
- dat_test
tabular data: Test set data.
- verbosity
Integer: Verbosity level.
Details
Methods are provided for preprocessing training set data, which accept a PreprocessorConfig
object, and for preprocessing validation and test set data, which accept a Preprocessor
object.
Examples
# Setup a `Preprocessor`: this outputs a `PreprocessorConfig` object.
prp <- setup_Preprocessor(remove_duplicates = TRUE, scale = TRUE, center = TRUE)
# Includes a long list of parameters
prp
#> <PreprocessorConfig>
#> complete_cases: <lgc> FALSE
#> remove_features_thres: <NUL> NULL
#> remove_cases_thres: <NUL> NULL
#> missingness: <lgc> FALSE
#> impute: <lgc> FALSE
#> impute_type: <chr> missRanger
#> impute_missRanger_params:
#> pmm.k: <nmr> 3.00
#> maxiter: <nmr> 10.00
#> num.trees: <nmr> 500.00
#> impute_discrete: <chr> get_mode
#> impute_continuous: <chr> mean
#> integer2factor: <lgc> FALSE
#> integer2numeric: <lgc> FALSE
#> logical2factor: <lgc> FALSE
#> logical2numeric: <lgc> FALSE
#> numeric2factor: <lgc> FALSE
#> numeric2factor_levels: <NUL> NULL
#> numeric_cut_n: <nmr> 0.00
#> numeric_cut_labels: <lgc> FALSE
#> numeric_quant_n: <nmr> 0.00
#> numeric_quant_NAonly: <lgc> FALSE
#> unique_len2factor: <nmr> 0.00
#> character2factor: <lgc> FALSE
#> factorNA2missing: <lgc> FALSE
#> factorNA2missing_level: <chr> missing
#> factor2integer: <lgc> FALSE
#> factor2integer_startat0: <lgc> TRUE
#> scale: <lgc> TRUE
#> center: <lgc> TRUE
#> scale_centers: <NUL> NULL
#> scale_coefficients: <NUL> NULL
#> remove_constants: <lgc> FALSE
#> remove_constants_skip_missing: <lgc> TRUE
#> remove_duplicates: <lgc> TRUE
#> remove_features: <NUL> NULL
#> one_hot: <lgc> FALSE
#> one_hot_levels: <NUL> NULL
#> add_date_features: <lgc> FALSE
#> date_features: <chr> weekday, month, year
#> add_holidays: <lgc> FALSE
#> exclude: <NUL> NULL
# Resample iris to get train and test data
res <- resample(iris, setup_Resampler(seed = 2026))
#> 2026-02-22 18:59:31
#> Input contains more than one column; stratifying on last.
#> [resample]
#> 2026-02-22 18:59:31
#> Using max n bins possible = 3.
#> [kfold]
iris_train <- iris[res[[1]], ]
iris_test <- iris[-res[[1]], ]
# Preprocess training data
iris_pre <- preprocess(iris_train, prp)
#> 2026-02-22 18:59:31
#> Removing 1 duplicate case...
#> [preprocess]
#> 2026-02-22 18:59:31
#> Scaling and centering 4 numeric features...
#> [preprocess]
#> 2026-02-22 18:59:31
#> Preprocessing done.
#> [preprocess]
# Access preprocessed training data with `preprocessed()`
preprocessed(iris_pre)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 -0.87028640 1.0040625 -1.32659450 -1.29261555 setosa
#> 2 -1.11148263 -0.1115625 -1.32659450 -1.29261555 setosa
#> 3 -1.35267886 0.3346875 -1.38360658 -1.29261555 setosa
#> 4 -1.47327698 0.1115625 -1.26958242 -1.29261555 setosa
#> 5 -0.99088452 1.2271875 -1.32659450 -1.29261555 setosa
#> 6 -0.50849205 1.8965625 -1.15555826 -1.03136293 setosa
#> 7 -1.47327698 0.7809375 -1.32659450 -1.16198924 setosa
#> 9 -1.71447321 -0.3346875 -1.32659450 -1.29261555 setosa
#> 10 -1.11148263 0.1115625 -1.26958242 -1.42324186 setosa
#> 11 -0.50849205 1.4503125 -1.26958242 -1.29261555 setosa
#> 12 -1.23208075 0.7809375 -1.21257034 -1.29261555 setosa
#> 13 -1.23208075 -0.1115625 -1.32659450 -1.42324186 setosa
#> 14 -1.83507133 -0.1115625 -1.49763073 -1.42324186 setosa
#> 15 -0.02609959 2.1196875 -1.44061866 -1.29261555 setosa
#> 16 -0.14669771 3.0121875 -1.26958242 -1.03136293 setosa
#> 17 -0.50849205 1.8965625 -1.38360658 -1.03136293 setosa
#> 18 -0.87028640 1.0040625 -1.32659450 -1.16198924 setosa
#> 19 -0.14669771 1.6734375 -1.15555826 -1.16198924 setosa
#> 20 -0.87028640 1.6734375 -1.26958242 -1.16198924 setosa
#> 21 -0.50849205 0.7809375 -1.15555826 -1.29261555 setosa
#> 22 -0.87028640 1.4503125 -1.26958242 -1.03136293 setosa
#> 23 -1.47327698 1.2271875 -1.55464281 -1.29261555 setosa
#> 25 -1.23208075 0.7809375 -1.04153410 -1.29261555 setosa
#> 26 -0.99088452 -0.1115625 -1.21257034 -1.29261555 setosa
#> 27 -0.99088452 0.7809375 -1.21257034 -1.03136293 setosa
#> 28 -0.74968829 1.0040625 -1.26958242 -1.29261555 setosa
#> 30 -1.35267886 0.3346875 -1.21257034 -1.29261555 setosa
#> 31 -1.23208075 0.1115625 -1.21257034 -1.29261555 setosa
#> 33 -0.74968829 2.3428125 -1.26958242 -1.42324186 setosa
#> 34 -0.38789394 2.5659375 -1.32659450 -1.29261555 setosa
#> 35 -1.11148263 0.1115625 -1.26958242 -1.29261555 setosa
#> 36 -0.99088452 0.3346875 -1.44061866 -1.29261555 setosa
#> 37 -0.38789394 1.0040625 -1.38360658 -1.29261555 setosa
#> 38 -1.11148263 1.2271875 -1.32659450 -1.42324186 setosa
#> 39 -1.71447321 -0.1115625 -1.38360658 -1.29261555 setosa
#> 40 -0.87028640 0.7809375 -1.26958242 -1.29261555 setosa
#> 41 -0.99088452 1.0040625 -1.38360658 -1.16198924 setosa
#> 42 -1.59387510 -1.6734375 -1.38360658 -1.16198924 setosa
#> 43 -1.71447321 0.3346875 -1.38360658 -1.29261555 setosa
#> 44 -0.99088452 1.0040625 -1.21257034 -0.77011032 setosa
#> 45 -0.87028640 1.6734375 -1.04153410 -1.03136293 setosa
#> 46 -1.23208075 -0.1115625 -1.32659450 -1.16198924 setosa
#> 48 -1.47327698 0.3346875 -1.32659450 -1.29261555 setosa
#> 49 -0.62909017 1.4503125 -1.26958242 -1.29261555 setosa
#> 50 -0.99088452 0.5578125 -1.32659450 -1.29261555 setosa
#> 51 1.42107780 0.3346875 0.55480411 0.27490014 versicolor
#> 53 1.30047968 0.1115625 0.66882827 0.40552645 versicolor
#> 54 -0.38789394 -1.6734375 0.15571956 0.14427383 versicolor
#> 55 0.81808722 -0.5578125 0.49779203 0.40552645 versicolor
#> 56 -0.14669771 -0.5578125 0.44077995 0.14427383 versicolor
#> 57 0.57689099 0.5578125 0.55480411 0.53615275 versicolor
#> 58 -1.11148263 -1.4503125 -0.24336499 -0.24760509 versicolor
#> 59 0.93868533 -0.3346875 0.49779203 0.14427383 versicolor
#> 60 -0.74968829 -0.7809375 0.09870748 0.27490014 versicolor
#> 61 -0.99088452 -2.3428125 -0.12934084 -0.24760509 versicolor
#> 63 0.21509664 -1.8965625 0.15571956 -0.24760509 versicolor
#> 64 0.33569475 -0.3346875 0.55480411 0.27490014 versicolor
#> 66 1.05928345 0.1115625 0.38376788 0.27490014 versicolor
#> 67 -0.26729582 -0.1115625 0.44077995 0.40552645 versicolor
#> 68 -0.02609959 -0.7809375 0.21273164 -0.24760509 versicolor
#> 69 0.45629287 -1.8965625 0.44077995 0.40552645 versicolor
#> 70 -0.26729582 -1.2271875 0.09870748 -0.11697878 versicolor
#> 71 0.09449852 0.3346875 0.61181619 0.79740537 versicolor
#> 72 0.33569475 -0.5578125 0.15571956 0.14427383 versicolor
#> 73 0.57689099 -1.2271875 0.66882827 0.40552645 versicolor
#> 74 0.33569475 -0.5578125 0.55480411 0.01364752 versicolor
#> 75 0.69748910 -0.3346875 0.32675580 0.14427383 versicolor
#> 76 0.93868533 -0.1115625 0.38376788 0.27490014 versicolor
#> 77 1.17988156 -0.5578125 0.61181619 0.27490014 versicolor
#> 79 0.21509664 -0.3346875 0.44077995 0.40552645 versicolor
#> 80 -0.14669771 -1.0040625 -0.12934084 -0.24760509 versicolor
#> 81 -0.38789394 -1.4503125 0.04169540 -0.11697878 versicolor
#> 82 -0.38789394 -1.4503125 -0.01531668 -0.24760509 versicolor
#> 83 -0.02609959 -0.7809375 0.09870748 0.01364752 versicolor
#> 84 0.21509664 -0.7809375 0.78285243 0.53615275 versicolor
#> 85 -0.50849205 -0.1115625 0.44077995 0.40552645 versicolor
#> 86 0.21509664 0.7809375 0.44077995 0.53615275 versicolor
#> 87 1.05928345 0.1115625 0.55480411 0.40552645 versicolor
#> 88 0.57689099 -1.6734375 0.38376788 0.14427383 versicolor
#> 89 -0.26729582 -0.1115625 0.21273164 0.14427383 versicolor
#> 90 -0.38789394 -1.2271875 0.15571956 0.14427383 versicolor
#> 91 -0.38789394 -1.0040625 0.38376788 0.01364752 versicolor
#> 92 0.33569475 -0.1115625 0.49779203 0.27490014 versicolor
#> 93 -0.02609959 -1.0040625 0.15571956 0.01364752 versicolor
#> 94 -0.99088452 -1.6734375 -0.24336499 -0.24760509 versicolor
#> 95 -0.26729582 -0.7809375 0.26974372 0.14427383 versicolor
#> 96 -0.14669771 -0.1115625 0.26974372 0.01364752 versicolor
#> 98 0.45629287 -0.3346875 0.32675580 0.14427383 versicolor
#> 99 -0.87028640 -1.2271875 -0.41440123 -0.11697878 versicolor
#> 100 -0.14669771 -0.5578125 0.21273164 0.14427383 versicolor
#> 101 0.57689099 0.5578125 1.29596114 1.71178952 virginica
#> 102 -0.02609959 -0.7809375 0.78285243 0.92803168 virginica
#> 103 1.54167591 -0.1115625 1.23894906 1.18928429 virginica
#> 104 0.57689099 -0.3346875 1.06791282 0.79740537 virginica
#> 105 0.81808722 -0.1115625 1.18193698 1.31991060 virginica
#> 106 2.14466649 -0.1115625 1.63803362 1.18928429 virginica
#> 107 -1.11148263 -1.2271875 0.44077995 0.66677906 virginica
#> 109 1.05928345 -1.2271875 1.18193698 0.79740537 virginica
#> 110 1.66227403 1.2271875 1.35297322 1.71178952 virginica
#> 111 0.81808722 0.3346875 0.78285243 1.05865798 virginica
#> 112 0.69748910 -0.7809375 0.89687659 0.92803168 virginica
#> 113 1.17988156 -0.1115625 1.01090075 1.18928429 virginica
#> 114 -0.14669771 -1.2271875 0.72584035 1.05865798 virginica
#> 115 -0.02609959 -0.5578125 0.78285243 1.58116321 virginica
#> 116 0.69748910 0.3346875 0.89687659 1.45053691 virginica
#> 117 0.81808722 -0.1115625 1.01090075 0.79740537 virginica
#> 118 2.26526461 1.6734375 1.69504569 1.31991060 virginica
#> 119 2.26526461 -1.0040625 1.80906985 1.45053691 virginica
#> 120 0.21509664 -1.8965625 0.72584035 0.40552645 virginica
#> 121 1.30047968 0.3346875 1.12492490 1.45053691 virginica
#> 122 -0.26729582 -0.5578125 0.66882827 1.05865798 virginica
#> 124 0.57689099 -0.7809375 0.66882827 0.79740537 virginica
#> 126 1.66227403 0.3346875 1.29596114 0.79740537 virginica
#> 127 0.45629287 -0.5578125 0.61181619 0.79740537 virginica
#> 128 0.33569475 -0.1115625 0.66882827 0.79740537 virginica
#> 129 0.69748910 -0.5578125 1.06791282 1.18928429 virginica
#> 130 1.66227403 -0.1115625 1.18193698 0.53615275 virginica
#> 131 1.90347026 -0.5578125 1.35297322 0.92803168 virginica
#> 132 2.50646084 1.6734375 1.52400946 1.05865798 virginica
#> 133 0.69748910 -0.5578125 1.06791282 1.31991060 virginica
#> 134 0.57689099 -0.5578125 0.78285243 0.40552645 virginica
#> 136 2.26526461 -0.1115625 1.35297322 1.45053691 virginica
#> 137 0.57689099 0.7809375 1.06791282 1.58116321 virginica
#> 138 0.69748910 0.1115625 1.01090075 0.79740537 virginica
#> 139 0.21509664 -0.1115625 0.61181619 0.79740537 virginica
#> 140 1.30047968 0.1115625 0.95388867 1.18928429 virginica
#> 141 1.05928345 0.1115625 1.06791282 1.58116321 virginica
#> 142 1.30047968 0.1115625 0.78285243 1.45053691 virginica
#> 144 1.17988156 0.3346875 1.23894906 1.45053691 virginica
#> 146 1.05928345 -0.1115625 0.83986451 1.45053691 virginica
#> 147 0.57689099 -1.2271875 0.72584035 0.92803168 virginica
#> 148 0.81808722 -0.1115625 0.83986451 1.05865798 virginica
#> 149 0.45629287 0.7809375 0.95388867 1.45053691 virginica
#> 150 0.09449852 -0.1115625 0.78285243 0.79740537 virginica
# Apply the same preprocessing to test data
# In this case, the scale and center values from training data will be used.
# Note how `preprocess()` accepts either a `PreprocessorConfig` or `Preprocessor` object for
# this reason.
iris_test_pre <- preprocess(iris_test, iris_pre)
#> 2026-02-22 18:59:31
#> Scaling and centering 4 numeric features...
#> [preprocess]
#> 2026-02-22 18:59:31
#> Preprocessing done.
#> [preprocess]
# Access preprocessed test data
preprocessed(iris_test_pre)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 8 -0.99088452 0.7809375 -1.26958242 -1.2926155 setosa
#> 24 -0.87028640 0.5578125 -1.15555826 -0.9007366 setosa
#> 29 -0.74968829 0.7809375 -1.32659450 -1.2926155 setosa
#> 32 -0.50849205 0.7809375 -1.26958242 -1.0313629 setosa
#> 47 -0.87028640 1.6734375 -1.21257034 -1.2926155 setosa
#> 52 0.69748910 0.3346875 0.44077995 0.4055264 versicolor
#> 62 0.09449852 -0.1115625 0.26974372 0.4055264 versicolor
#> 65 -0.26729582 -0.3346875 -0.07232876 0.1442738 versicolor
#> 78 1.05928345 -0.1115625 0.72584035 0.6667791 versicolor
#> 97 -0.14669771 -0.3346875 0.26974372 0.1442738 versicolor
#> 108 1.78287214 -0.3346875 1.46699738 0.7974054 virginica
#> 123 2.26526461 -0.5578125 1.69504569 1.0586580 virginica
#> 125 1.05928345 0.5578125 1.12492490 1.1892843 virginica
#> 135 0.33569475 -1.0040625 1.06791282 0.2749001 virginica
#> 145 1.05928345 0.5578125 1.12492490 1.7117895 virginica