Link: R
Remove . In Column Name function
## On-the-fly
colnames(df) <- gsub("\\.", " ", colnames(df))
## Function
makeColNamesUserFriendly <- function(ds) {
# Convert any number of consecutive dots to a single space.
names(ds) <- gsub(x = names(ds),
pattern = "(\\.)+",
replacement = " ")
# Drop the trailing spaces.
names(ds) <- gsub(x = names(ds),
pattern = "( )+$",
replacement = "")
ds
}
## Usage
df <- makeColNamesUserFriendly(df)
Overlap value for multiple columns
Update:
See my blog post Replace missing value from other columns using coalesce join in dplyr for an advanced version of coalesce
function.
# Or match together a complete vector from missing pieces
y <- c(1, 2, NA, NA, 5)
z <- c(NA, NA, 3, 4, 5)
coalesce(y, z)
#> [1] 1 2 3 4 5
From <dplyr.tidyverse.org/reference/coalesce.html>
Replace value
mutate(sample = gsub("_x|\\.\\d+[A-Za-z]+", "", sample))
Replace_na()
Replace string if it contains certain value:
mutate(Owning_LOB = if_else(str_detect(Owning_LOB, "AMCB"), "AMCB - Commercial Banking", Owning_LOB))
Summarize and count
# count LOB
test <- master %>%
group_by(LOB) %>%
summarize(count = n())
df2 <- confirmed %>% count(Owning_LOB)
test2 <- master %>% filter(LOB != "Unknown")
# Count only
n_distinct(confirmed$Owning_LOB)
count <- n_distinct(test$Application.Code)
Comparing row bind: base, dplyr and data.table
Comparing base::rbind
vs dplyr::bind_rows
vs data.table::rbindlist
TL;DR: data.table::rbindlist
is the most powerful out of three.
base::rbind
can cope some mixed data types but cannot handle missing columns.
dplyr::bind_rows
can handle missing columns, but will fail to bind with mixed data types (e.g datetime to another) when there are some discrepancy.
data.table::rbindlist
can ignore different data types when bind rows:
(set use.names as FALSE to also ignore column names), such as:
all_inv <- data.table::rbindlist(list(last_inv_clean, this_inv), use.names = FALSE)
Get the max from sub group
Apply rules to LOR: Max > Min > PrivOnly
# Slice to keep the first row for duplicate apps
keepmax <- groupraw %>%
group_by(Application.Code) %>%
slice_max(Legal_Impact, n = 1)
Identify duplicate variables/columns
which(duplicated(names(this_df)))
Select columns
## Select last n columns:
tail(names(.), x)
## Select first and last 2 columns
select(1, tail(names(.), 2))
Relocate column
## Move column to the last
data %>% relocate(b, .after = last_col())
String detect
## From stringR
## If the first letter is AMCB:
filter(!str_extract(LOB, "^.{4}") == "AMCB")
Custom functions
Standardize column names as per template
##----------------------------------------------------------------
## col_dictionary() -
## standardize column names -
##----------------------------------------------------------------
## Read standardized column from a csv file, which contains old_name and new_name
col_dictionary <- read.csv("Data/cols_formatting.csv")
col_standardize <- function(data) {
for(i in 1:nrow(col_dictionary))
names(data)[names(data) == col_dictionary$old_name[i]] = col_dictionary$new_name[i]
return(data)
}
Clean export to excel
Add timestamp for export file when Working with Excel from R
##---------------------------------------------------------------
## add_file_date() -
## add date and time stamp for file names -
##---------------------------------------------------------------
add_file_date <- function(file_name) {
file_name = paste0(file_name, "_", format(Sys.time(), format = "%Y%m%d-%H%M"), ".xlsx")
return(file_name)
## Usage
## This will generates a file named "test_20210101-1501.xlsx"
write.xlsx(df, add_file_date("test"))
Combining added timestamp and cleaned columns:
##----------------------------------------------------------------
## clean_export() -
## replace dot with space in column names -
## and export to excel with added timestamp -
##----------------------------------------------------------------
library(dplyr)
library(stringr)
library(openxlsx)
clean_export <- function(data, file_name) {
exportfile = write.xlsx(data %>%
rename_all( ~ str_replace_all(., \\., " ")), file = add_file_date(file_name)) # replace period with space in column names
return(exportfile)
}
## Usage
clean_export(df, "test")