Tidyverse cheatsheet

Link: R

Remove . In Column Name function

## On-the-fly
colnames(df) <- gsub("\\.", " ", colnames(df))
 
## Function
makeColNamesUserFriendly <- function(ds) {
  # Convert any number of consecutive dots to a single space.
  names(ds) <- gsub(x = names(ds),
                    pattern = "(\\.)+",
                    replacement = " ")
 
  # Drop the trailing spaces.
  names(ds) <- gsub(x = names(ds),
                    pattern = "( )+$",
                    replacement = "")
  ds
}
 
## Usage
df <- makeColNamesUserFriendly(df)

Overlap value for multiple columns

Update: See my blog post Replace missing value from other columns using coalesce join in dplyr for an advanced version of coalesce function.

# Or match together a complete vector from missing pieces
y <- c(1, 2, NA, NA, 5)
z <- c(NA, NA, 3, 4, 5)
coalesce(y, z)
#> [1] 1 2 3 4 5
 
From <dplyr.tidyverse.org/reference/coalesce.html>

Replace value

mutate(sample = gsub("_x|\\.\\d+[A-Za-z]+", "", sample))
Replace_na()
 
Replace string if it contains certain value:
mutate(Owning_LOB = if_else(str_detect(Owning_LOB, "AMCB"), "AMCB - Commercial Banking", Owning_LOB))

Summarize and count

# count LOB
test <- master %>%
group_by(LOB) %>%
 summarize(count = n())
 
df2 <- confirmed %>% count(Owning_LOB)
 
test2 <- master %>% filter(LOB != "Unknown")
# Count only
 
n_distinct(confirmed$Owning_LOB)
 
count <- n_distinct(test$Application.Code)

Comparing row bind: base, dplyr and data.table

Comparing base::rbind vs dplyr::bind_rows vs data.table::rbindlist

TL;DR: data.table::rbindlist is the most powerful out of three.

base::rbind can cope some mixed data types but cannot handle missing columns.

dplyr::bind_rows can handle missing columns, but will fail to bind with mixed data types (e.g datetime to another) when there are some discrepancy.

data.table::rbindlist can ignore different data types when bind rows: (set use.names as FALSE to also ignore column names), such as:

all_inv <- data.table::rbindlist(list(last_inv_clean, this_inv), use.names = FALSE)

Get the max from sub group

Apply rules to LOR: Max > Min > PrivOnly

# Slice to keep the first row for duplicate apps
keepmax <- groupraw %>%
  group_by(Application.Code) %>%
  slice_max(Legal_Impact, n = 1)

Identify duplicate variables/columns

which(duplicated(names(this_df)))

Select columns

## Select last n columns:
tail(names(.), x)
 
## Select first and last 2 columns
select(1, tail(names(.), 2))

Relocate column

## Move column to the last
data %>% relocate(b, .after = last_col())

String detect

## From stringR
## If the first letter is AMCB:
filter(!str_extract(LOB, "^.{4}") == "AMCB")

Custom functions

Standardize column names as per template

##----------------------------------------------------------------
##                        col_dictionary()                       -
##                    standardize column names                   -
##----------------------------------------------------------------
## Read standardized column from a csv file, which contains old_name and new_name
col_dictionary <- read.csv("Data/cols_formatting.csv")
 
col_standardize <- function(data) {
  for(i in 1:nrow(col_dictionary))
    names(data)[names(data) == col_dictionary$old_name[i]] = col_dictionary$new_name[i]
  return(data)
  }

Clean export to excel

Add timestamp for export file when Working with Excel from R

##---------------------------------------------------------------
##                        add_file_date()                       -
##        add date and time stamp for file names                -
##---------------------------------------------------------------
add_file_date <- function(file_name) {
  file_name = paste0(file_name, "_", format(Sys.time(), format = "%Y%m%d-%H%M"), ".xlsx")
  return(file_name)
 
## Usage
## This will generates a file named "test_20210101-1501.xlsx"
write.xlsx(df, add_file_date("test"))

Combining added timestamp and cleaned columns:

##----------------------------------------------------------------
##                        clean_export()                         -
##            replace dot with space in column names             -
##            and export to excel with added timestamp           -
##----------------------------------------------------------------
library(dplyr)
library(stringr)
library(openxlsx)
 
clean_export <- function(data, file_name) {
  exportfile = write.xlsx(data %>%
                            rename_all( ~ str_replace_all(., \\., " ")), file = add_file_date(file_name)) # replace period with space in column names
  return(exportfile)
}
 
## Usage
clean_export(df, "test")

🪴 Aster's notebook

Recent updated

Explorer

Tidyverse cheatsheet

Remove . In Column Name function

Overlap value for multiple columns

Replace value

Summarize and count

Comparing row bind: base, dplyr and data.table

Get the max from sub group

Identify duplicate variables/columns

Select columns

Relocate column

String detect

Custom functions

Standardize column names as per template

Clean export to excel

Graph View

Table of Contents

Backlinks

🪴 Aster's notebook

Recent updated

Explorer

Tidyverse cheatsheet

Remove . In Column Name function §

Overlap value for multiple columns §

Replace value §

Summarize and count §

Comparing row bind: base, dplyr and data.table §

Get the max from sub group §

Identify duplicate variables/columns §

Select columns §

Relocate column §

String detect §

Custom functions §

Standardize column names as per template §

Clean export to excel §

Graph View

Table of Contents

Backlinks

Remove . In Column Name function

Overlap value for multiple columns

Replace value

Summarize and count

Comparing row bind: base, dplyr and data.table

Get the max from sub group

Identify duplicate variables/columns

Select columns

Relocate column

String detect

Custom functions

Standardize column names as per template

Clean export to excel