Check Data
Usage
check_data(
x,
name = NULL,
get_duplicates = TRUE,
get_na_case_pct = FALSE,
get_na_feature_pct = FALSE
)

Examples
n <- 1000
x <- rnormmat(n, 50, return_df = TRUE)
x$char1 <- sample(letters, n, TRUE)
x$char2 <- sample(letters, n, TRUE)
x$fct <- factor(sample(letters, n, TRUE))
x <- rbind(x, x[1, ])
x$const <- 99L
x[sample(nrow(x), 20), 3] <- NA
x[sample(nrow(x), 20), 10] <- NA
x$fct[30:35] <- NA
check_data(x)
#> x: A data.table with 1001 rows and 54 columns.
#>
#> Data types
#> * 50 numeric features
#> * 1 integer feature
#> * 1 factor, which is not ordered
#> * 2 character features
#> * 0 date features
#>
#> Issues
#> * 1 constant feature
#> * 1 duplicate case
#> * 3 features include 'NA' values; 46 'NA' values total
#> * 1 factor; 2 numeric
#>
#> Recommendations
#> * Consider converting character features to factors or excluding them.
#> * Remove the constant feature.
#> * Consider removing the duplicate case.
#> * Consider using algorithms that can handle missingness or imputing missing values.