Skip to contents

Preprocess data for analysis and visualization.

Usage

preprocess(x, config, ...)

preprocess.class_tabular.PreprocessorConfig(
  x,
  config,
  dat_validation = NULL,
  dat_test = NULL,
  verbosity = 1L
)

preprocess.class_tabular.Preprocessor(x, config, verbosity = 1L)

Arguments

x

data.frame, data.table, tbl_df (tabular data): Data to be preprocessed.

config

PreprocessorConfig: Created using setup_Preprocessor; OR a Preprocessor object: the output of a previous run of preprocess. Passing a Preprocessor allows, for example, applying the same preprocessing to a validation or test set as was used for the training set. In particular, the same scale centers and coefficients will be applied to the new data.

...

Not used.

dat_validation

tabular data: Validation set data.

dat_test

tabular data: Test set data.

verbosity

Integer: Verbosity level.

Value

Preprocessor object.

Details

Methods are provided for preprocessing training set data, which accept a PreprocessorConfig object, and for preprocessing validation and test set data, which accept a Preprocessor object.

Author

EDG

Examples

# Setup a `Preprocessor`: this outputs a `PreprocessorConfig` object.
prp <- setup_Preprocessor(remove_duplicates = TRUE, scale = TRUE, center = TRUE)

# Includes a long list of parameters
prp
#> <PreprocessorConfig>
#>                complete_cases: <lgc> FALSE
#>         remove_features_thres: <NUL> NULL
#>            remove_cases_thres: <NUL> NULL
#>                   missingness: <lgc> FALSE
#>                        impute: <lgc> FALSE
#>                   impute_type: <chr> missRanger
#>      impute_missRanger_params: 
#>                                    pmm.k: <nmr> 3.00
#>                                  maxiter: <nmr> 10.00
#>                                num.trees: <nmr> 500.00
#>               impute_discrete: <chr> get_mode
#>             impute_continuous: <chr> mean
#>                integer2factor: <lgc> FALSE
#>               integer2numeric: <lgc> FALSE
#>                logical2factor: <lgc> FALSE
#>               logical2numeric: <lgc> FALSE
#>                numeric2factor: <lgc> FALSE
#>         numeric2factor_levels: <NUL> NULL
#>                 numeric_cut_n: <nmr> 0.00
#>            numeric_cut_labels: <lgc> FALSE
#>               numeric_quant_n: <nmr> 0.00
#>          numeric_quant_NAonly: <lgc> FALSE
#>             unique_len2factor: <nmr> 0.00
#>              character2factor: <lgc> FALSE
#>              factorNA2missing: <lgc> FALSE
#>        factorNA2missing_level: <chr> missing
#>                factor2integer: <lgc> FALSE
#>       factor2integer_startat0: <lgc> TRUE
#>                         scale: <lgc> TRUE
#>                        center: <lgc> TRUE
#>                 scale_centers: <NUL> NULL
#>            scale_coefficients: <NUL> NULL
#>              remove_constants: <lgc> FALSE
#> remove_constants_skip_missing: <lgc> TRUE
#>             remove_duplicates: <lgc> TRUE
#>               remove_features: <NUL> NULL
#>                       one_hot: <lgc> FALSE
#>                one_hot_levels: <NUL> NULL
#>             add_date_features: <lgc> FALSE
#>                 date_features: <chr> weekday, month, year
#>                  add_holidays: <lgc> FALSE
#>                       exclude: <NUL> NULL

# Resample iris to get train and test data
res <- resample(iris, setup_Resampler(seed = 2026))
#> 2026-02-22 18:59:31 
#> Input contains more than one column; stratifying on last.
#>  [resample]
#> 2026-02-22 18:59:31 
#> Using max n bins possible = 3.
#>  [kfold]
iris_train <- iris[res[[1]], ]
iris_test <- iris[-res[[1]], ]

# Preprocess training data
iris_pre <- preprocess(iris_train, prp)
#> 2026-02-22 18:59:31 
#> Removing 1 duplicate case...
#>  [preprocess]
#> 2026-02-22 18:59:31 
#> Scaling and centering 4 numeric features...
#>  [preprocess]
#> 2026-02-22 18:59:31 
#> Preprocessing done.
#>  [preprocess]

# Access preprocessed training data with `preprocessed()`
preprocessed(iris_pre)
#>     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
#> 1    -0.87028640   1.0040625  -1.32659450 -1.29261555     setosa
#> 2    -1.11148263  -0.1115625  -1.32659450 -1.29261555     setosa
#> 3    -1.35267886   0.3346875  -1.38360658 -1.29261555     setosa
#> 4    -1.47327698   0.1115625  -1.26958242 -1.29261555     setosa
#> 5    -0.99088452   1.2271875  -1.32659450 -1.29261555     setosa
#> 6    -0.50849205   1.8965625  -1.15555826 -1.03136293     setosa
#> 7    -1.47327698   0.7809375  -1.32659450 -1.16198924     setosa
#> 9    -1.71447321  -0.3346875  -1.32659450 -1.29261555     setosa
#> 10   -1.11148263   0.1115625  -1.26958242 -1.42324186     setosa
#> 11   -0.50849205   1.4503125  -1.26958242 -1.29261555     setosa
#> 12   -1.23208075   0.7809375  -1.21257034 -1.29261555     setosa
#> 13   -1.23208075  -0.1115625  -1.32659450 -1.42324186     setosa
#> 14   -1.83507133  -0.1115625  -1.49763073 -1.42324186     setosa
#> 15   -0.02609959   2.1196875  -1.44061866 -1.29261555     setosa
#> 16   -0.14669771   3.0121875  -1.26958242 -1.03136293     setosa
#> 17   -0.50849205   1.8965625  -1.38360658 -1.03136293     setosa
#> 18   -0.87028640   1.0040625  -1.32659450 -1.16198924     setosa
#> 19   -0.14669771   1.6734375  -1.15555826 -1.16198924     setosa
#> 20   -0.87028640   1.6734375  -1.26958242 -1.16198924     setosa
#> 21   -0.50849205   0.7809375  -1.15555826 -1.29261555     setosa
#> 22   -0.87028640   1.4503125  -1.26958242 -1.03136293     setosa
#> 23   -1.47327698   1.2271875  -1.55464281 -1.29261555     setosa
#> 25   -1.23208075   0.7809375  -1.04153410 -1.29261555     setosa
#> 26   -0.99088452  -0.1115625  -1.21257034 -1.29261555     setosa
#> 27   -0.99088452   0.7809375  -1.21257034 -1.03136293     setosa
#> 28   -0.74968829   1.0040625  -1.26958242 -1.29261555     setosa
#> 30   -1.35267886   0.3346875  -1.21257034 -1.29261555     setosa
#> 31   -1.23208075   0.1115625  -1.21257034 -1.29261555     setosa
#> 33   -0.74968829   2.3428125  -1.26958242 -1.42324186     setosa
#> 34   -0.38789394   2.5659375  -1.32659450 -1.29261555     setosa
#> 35   -1.11148263   0.1115625  -1.26958242 -1.29261555     setosa
#> 36   -0.99088452   0.3346875  -1.44061866 -1.29261555     setosa
#> 37   -0.38789394   1.0040625  -1.38360658 -1.29261555     setosa
#> 38   -1.11148263   1.2271875  -1.32659450 -1.42324186     setosa
#> 39   -1.71447321  -0.1115625  -1.38360658 -1.29261555     setosa
#> 40   -0.87028640   0.7809375  -1.26958242 -1.29261555     setosa
#> 41   -0.99088452   1.0040625  -1.38360658 -1.16198924     setosa
#> 42   -1.59387510  -1.6734375  -1.38360658 -1.16198924     setosa
#> 43   -1.71447321   0.3346875  -1.38360658 -1.29261555     setosa
#> 44   -0.99088452   1.0040625  -1.21257034 -0.77011032     setosa
#> 45   -0.87028640   1.6734375  -1.04153410 -1.03136293     setosa
#> 46   -1.23208075  -0.1115625  -1.32659450 -1.16198924     setosa
#> 48   -1.47327698   0.3346875  -1.32659450 -1.29261555     setosa
#> 49   -0.62909017   1.4503125  -1.26958242 -1.29261555     setosa
#> 50   -0.99088452   0.5578125  -1.32659450 -1.29261555     setosa
#> 51    1.42107780   0.3346875   0.55480411  0.27490014 versicolor
#> 53    1.30047968   0.1115625   0.66882827  0.40552645 versicolor
#> 54   -0.38789394  -1.6734375   0.15571956  0.14427383 versicolor
#> 55    0.81808722  -0.5578125   0.49779203  0.40552645 versicolor
#> 56   -0.14669771  -0.5578125   0.44077995  0.14427383 versicolor
#> 57    0.57689099   0.5578125   0.55480411  0.53615275 versicolor
#> 58   -1.11148263  -1.4503125  -0.24336499 -0.24760509 versicolor
#> 59    0.93868533  -0.3346875   0.49779203  0.14427383 versicolor
#> 60   -0.74968829  -0.7809375   0.09870748  0.27490014 versicolor
#> 61   -0.99088452  -2.3428125  -0.12934084 -0.24760509 versicolor
#> 63    0.21509664  -1.8965625   0.15571956 -0.24760509 versicolor
#> 64    0.33569475  -0.3346875   0.55480411  0.27490014 versicolor
#> 66    1.05928345   0.1115625   0.38376788  0.27490014 versicolor
#> 67   -0.26729582  -0.1115625   0.44077995  0.40552645 versicolor
#> 68   -0.02609959  -0.7809375   0.21273164 -0.24760509 versicolor
#> 69    0.45629287  -1.8965625   0.44077995  0.40552645 versicolor
#> 70   -0.26729582  -1.2271875   0.09870748 -0.11697878 versicolor
#> 71    0.09449852   0.3346875   0.61181619  0.79740537 versicolor
#> 72    0.33569475  -0.5578125   0.15571956  0.14427383 versicolor
#> 73    0.57689099  -1.2271875   0.66882827  0.40552645 versicolor
#> 74    0.33569475  -0.5578125   0.55480411  0.01364752 versicolor
#> 75    0.69748910  -0.3346875   0.32675580  0.14427383 versicolor
#> 76    0.93868533  -0.1115625   0.38376788  0.27490014 versicolor
#> 77    1.17988156  -0.5578125   0.61181619  0.27490014 versicolor
#> 79    0.21509664  -0.3346875   0.44077995  0.40552645 versicolor
#> 80   -0.14669771  -1.0040625  -0.12934084 -0.24760509 versicolor
#> 81   -0.38789394  -1.4503125   0.04169540 -0.11697878 versicolor
#> 82   -0.38789394  -1.4503125  -0.01531668 -0.24760509 versicolor
#> 83   -0.02609959  -0.7809375   0.09870748  0.01364752 versicolor
#> 84    0.21509664  -0.7809375   0.78285243  0.53615275 versicolor
#> 85   -0.50849205  -0.1115625   0.44077995  0.40552645 versicolor
#> 86    0.21509664   0.7809375   0.44077995  0.53615275 versicolor
#> 87    1.05928345   0.1115625   0.55480411  0.40552645 versicolor
#> 88    0.57689099  -1.6734375   0.38376788  0.14427383 versicolor
#> 89   -0.26729582  -0.1115625   0.21273164  0.14427383 versicolor
#> 90   -0.38789394  -1.2271875   0.15571956  0.14427383 versicolor
#> 91   -0.38789394  -1.0040625   0.38376788  0.01364752 versicolor
#> 92    0.33569475  -0.1115625   0.49779203  0.27490014 versicolor
#> 93   -0.02609959  -1.0040625   0.15571956  0.01364752 versicolor
#> 94   -0.99088452  -1.6734375  -0.24336499 -0.24760509 versicolor
#> 95   -0.26729582  -0.7809375   0.26974372  0.14427383 versicolor
#> 96   -0.14669771  -0.1115625   0.26974372  0.01364752 versicolor
#> 98    0.45629287  -0.3346875   0.32675580  0.14427383 versicolor
#> 99   -0.87028640  -1.2271875  -0.41440123 -0.11697878 versicolor
#> 100  -0.14669771  -0.5578125   0.21273164  0.14427383 versicolor
#> 101   0.57689099   0.5578125   1.29596114  1.71178952  virginica
#> 102  -0.02609959  -0.7809375   0.78285243  0.92803168  virginica
#> 103   1.54167591  -0.1115625   1.23894906  1.18928429  virginica
#> 104   0.57689099  -0.3346875   1.06791282  0.79740537  virginica
#> 105   0.81808722  -0.1115625   1.18193698  1.31991060  virginica
#> 106   2.14466649  -0.1115625   1.63803362  1.18928429  virginica
#> 107  -1.11148263  -1.2271875   0.44077995  0.66677906  virginica
#> 109   1.05928345  -1.2271875   1.18193698  0.79740537  virginica
#> 110   1.66227403   1.2271875   1.35297322  1.71178952  virginica
#> 111   0.81808722   0.3346875   0.78285243  1.05865798  virginica
#> 112   0.69748910  -0.7809375   0.89687659  0.92803168  virginica
#> 113   1.17988156  -0.1115625   1.01090075  1.18928429  virginica
#> 114  -0.14669771  -1.2271875   0.72584035  1.05865798  virginica
#> 115  -0.02609959  -0.5578125   0.78285243  1.58116321  virginica
#> 116   0.69748910   0.3346875   0.89687659  1.45053691  virginica
#> 117   0.81808722  -0.1115625   1.01090075  0.79740537  virginica
#> 118   2.26526461   1.6734375   1.69504569  1.31991060  virginica
#> 119   2.26526461  -1.0040625   1.80906985  1.45053691  virginica
#> 120   0.21509664  -1.8965625   0.72584035  0.40552645  virginica
#> 121   1.30047968   0.3346875   1.12492490  1.45053691  virginica
#> 122  -0.26729582  -0.5578125   0.66882827  1.05865798  virginica
#> 124   0.57689099  -0.7809375   0.66882827  0.79740537  virginica
#> 126   1.66227403   0.3346875   1.29596114  0.79740537  virginica
#> 127   0.45629287  -0.5578125   0.61181619  0.79740537  virginica
#> 128   0.33569475  -0.1115625   0.66882827  0.79740537  virginica
#> 129   0.69748910  -0.5578125   1.06791282  1.18928429  virginica
#> 130   1.66227403  -0.1115625   1.18193698  0.53615275  virginica
#> 131   1.90347026  -0.5578125   1.35297322  0.92803168  virginica
#> 132   2.50646084   1.6734375   1.52400946  1.05865798  virginica
#> 133   0.69748910  -0.5578125   1.06791282  1.31991060  virginica
#> 134   0.57689099  -0.5578125   0.78285243  0.40552645  virginica
#> 136   2.26526461  -0.1115625   1.35297322  1.45053691  virginica
#> 137   0.57689099   0.7809375   1.06791282  1.58116321  virginica
#> 138   0.69748910   0.1115625   1.01090075  0.79740537  virginica
#> 139   0.21509664  -0.1115625   0.61181619  0.79740537  virginica
#> 140   1.30047968   0.1115625   0.95388867  1.18928429  virginica
#> 141   1.05928345   0.1115625   1.06791282  1.58116321  virginica
#> 142   1.30047968   0.1115625   0.78285243  1.45053691  virginica
#> 144   1.17988156   0.3346875   1.23894906  1.45053691  virginica
#> 146   1.05928345  -0.1115625   0.83986451  1.45053691  virginica
#> 147   0.57689099  -1.2271875   0.72584035  0.92803168  virginica
#> 148   0.81808722  -0.1115625   0.83986451  1.05865798  virginica
#> 149   0.45629287   0.7809375   0.95388867  1.45053691  virginica
#> 150   0.09449852  -0.1115625   0.78285243  0.79740537  virginica

# Apply the same preprocessing to test data
# In this case, the scale and center values from training data will be used.
# Note how `preprocess()` accepts either a `PreprocessorConfig` or `Preprocessor` object for
# this reason.
iris_test_pre <- preprocess(iris_test, iris_pre)
#> 2026-02-22 18:59:31 
#> Scaling and centering 4 numeric features...
#>  [preprocess]
#> 2026-02-22 18:59:31 
#> Preprocessing done.
#>  [preprocess]

# Access preprocessed test data
preprocessed(iris_test_pre)
#>     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
#> 8    -0.99088452   0.7809375  -1.26958242  -1.2926155     setosa
#> 24   -0.87028640   0.5578125  -1.15555826  -0.9007366     setosa
#> 29   -0.74968829   0.7809375  -1.32659450  -1.2926155     setosa
#> 32   -0.50849205   0.7809375  -1.26958242  -1.0313629     setosa
#> 47   -0.87028640   1.6734375  -1.21257034  -1.2926155     setosa
#> 52    0.69748910   0.3346875   0.44077995   0.4055264 versicolor
#> 62    0.09449852  -0.1115625   0.26974372   0.4055264 versicolor
#> 65   -0.26729582  -0.3346875  -0.07232876   0.1442738 versicolor
#> 78    1.05928345  -0.1115625   0.72584035   0.6667791 versicolor
#> 97   -0.14669771  -0.3346875   0.26974372   0.1442738 versicolor
#> 108   1.78287214  -0.3346875   1.46699738   0.7974054  virginica
#> 123   2.26526461  -0.5578125   1.69504569   1.0586580  virginica
#> 125   1.05928345   0.5578125   1.12492490   1.1892843  virginica
#> 135   0.33569475  -1.0040625   1.06791282   0.2749001  virginica
#> 145   1.05928345   0.5578125   1.12492490   1.7117895  virginica